FirteX-高性能全文索引和检索平台

API Documentation


首页 | 名字空间列表 | 类继承关系 | 组合类型列表 | 目录 | 文件列表 | 名字空间成员 | 组合类型成员 | 文件成员

TrecCollection.h

浏览该文件的文档。
00001 //
00002 // Copyright(C) 2005--2006 Institute of Computing Tech, Chinese Academy of Sciences. 
00003 // All rights reserved.
00004 // This file is part of FirteX (www.firtex.org)
00005 //
00006 // Use of the FirteX is subject to the terms of the software license set forth in 
00007 // the LICENSE file included with this software, and also available at
00008 // http://www.firtex.org/license.html
00009 //
00010 // Author       : 郭瑞杰(GuoRuijie)
00011 // Email        : ruijieguo@software.ict.ac.cn,ruijieguo@gmail.com
00012 // Created      : 2006/8/2
00013 //
00014 #ifndef _TRECCOLLECTION_H
00015 #define _TRECCOLLECTION_H
00016 
00017 #if _MSC_VER > 1000
00018 #pragma once
00019 #endif // _MSC_VER > 1000
00020 
00021 #include "Collection.h"
00022 
00023 #define  MAX_TAGS 10 // NOTE(review): not referenced in this header; presumably an upper bound on the number of tag pairs held in CTrecCollection::m_tags -- confirm against the .cpp
00024 
00025 namespace firtex
00026 {
00027         namespace collection
00028         {
00029                 class CTrecCollection : public CCollection
00030                 {
                              // A CCollection specialisation that works on tag-delimited text
                              // ("<tag>value</tag>" fields, see CTrecTagPair and processTag()).
                              // NOTE(review): the name suggests TREC-formatted corpora; the file
                              // format itself is not visible in this header -- confirm in the .cpp.
00031                 public:
                              // Identifier string of this collection type (defined out of line).
00032                         static const tstring identifier;
00033                 protected:
                              // One schema field together with its pre-built "<tag>" and "</tag>"
                              // strings. Owns both strings; instances are not copyable.
00034                         class CTrecTagPair
00035                         {
00036                         public:
                                      // Builds "<tag>" into beginTag and "</tag>" into endTag.
                                      // sid : schema id stored in `schemaid`.
                                      // tag : bare tag name without angle brackets; must be a valid,
                                      //       NUL-terminated string (it is passed to _tcslen/_tcscpy).
00037                                 CTrecTagPair(schemaid_t sid,const tchar* tag)
00038                                         :schemaid(sid)
00039                                 {
00040                                         tagLen = (int32_t)_tcslen(tag);
                                              // '<' + tag + '>' + terminator  => tagLen + 3 characters
00041                                         beginTag = new tchar[tagLen + 3];
00042                                         beginTag[0] = _T('<');
00043                                         _tcscpy(beginTag + 1,tag);
00044                                         beginTag[tagLen + 1] = _T('>');
00045                                         beginTag[tagLen + 2] = 0;
                                              // '<' + '/' + tag + '>' + terminator => tagLen + 4 characters
00046                                         endTag = new tchar[tagLen + 4];
00047                                         _tcscpy(endTag + 2,tag);
00048                                         endTag[0] = _T('<');
00049                                         endTag[1] = _T('/');
00050                                         endTag[tagLen + 2] = _T('>');
00051                                         endTag[tagLen + 3] = 0;
00052                                 }
                                      // Releases both tag strings allocated by the constructor.
00053                                 ~CTrecTagPair()
00054                                 {
00055                                         delete[] beginTag;
00056                                         delete[] endTag;
00057                                 }
00058                         public:
00059                                 schemaid_t      schemaid;       // schema id passed to the constructor
00060                                 tchar*          beginTag;       // "<tag>", owned, NUL-terminated
00061                                 tchar*          endTag;         // "</tag>", owned, NUL-terminated
00062                                 int32_t         tagLen;         // length of the bare tag name (no brackets)
                              private:
                                      // Not copyable: a member-wise copy would make two objects
                                      // delete[] the same beginTag/endTag buffers. Declared but
                                      // intentionally left undefined.
                                      CTrecTagPair(const CTrecTagPair&);
                                      CTrecTagPair& operator=(const CTrecTagPair&);
00063                         };
00064                 public:
00065                         CTrecCollection(void);
                              // NOTE(review): `dir` is presumably stored in m_sDirectory and `pWriter`
                              // forwarded to the base class; the definition is not in this header.
00066                         CTrecCollection(const tchar* dir,CIndexWriter* pWriter);
00067                         virtual ~CTrecCollection(void);
00068                 public:
                              // NOTE(review): defined elsewhere; the meaning of the bool result is not
                              // visible here.
00072                         bool            scanInternal();
00073                         
                              // NOTE(review): defined elsewhere; presumably parses the single file
                              // named by `filename` -- confirm in the .cpp.
00078                         void            parseFile(const tstring& filename);
00079                 protected:
                              // NOTE(review): defined elsewhere; presumably fills `schema` (and the tag
                              // table) from `schemaFile` -- confirm in the .cpp.
00083                         void            loadSchema(const tstring& schemaFile,CDocumentSchema& schema);  
                              // Matches one "<tag>value</tag>" field at `start`; defined inline
                              // after the class body.
00084                         bool            processTag(char*& start,char* end,CTrecTagPair* p,char*& value,size_t& valueLen,bool skip);
00085                         
                              // NOTE(review): defined elsewhere; presumably walks m_contentBuffer
                              // using processTag() -- confirm in the .cpp.
00086                         void            doParse();
00087                 protected:                      
00088                         tstring                 m_sDirectory;
00089                         CTrecTagPair**  m_tags;                 // array of tag-pair pointers
00090                         int32_t                 m_numTags;      // NOTE(review): presumably the number of entries in m_tags
00091                         char*                   m_contentBuffer;
00092                         int32_t                 m_length;       // NOTE(review): presumably the size of m_contentBuffer
00093                 };
00095                 // Tries to consume one "<tag>value</tag>" field located at `start`.
                      //
                      // start    : in/out cursor. Leading '\n' and ' ' characters are always
                      //            skipped. On success it is moved just past the end tag. If the
                      //            begin tag does not match it stays after the skipped whitespace;
                      //            if no end tag can be found it is set to `end`.
                      // end      : upper bound of the buffer being scanned.
                      // p        : tag pair supplying beginTag, endTag and tagLen.
                      // value    : out, first character after the begin tag (only when !skip).
                      // valueLen : out, number of characters between the tags (only when !skip).
                      // skip     : when true the field is consumed but value/valueLen are untouched.
                      // Returns true when the field was consumed, false otherwise.
                      //
                      // Assumes the text is NUL-terminated at or before the end of the underlying
                      // allocation, because strncmp/strstr/strlen are used on it -- TODO confirm.
                      // NOTE(review): beginTag/endTag are tchar* while the buffer is char*; this
                      // only compiles when tchar is char -- confirm the build configuration.
00096                 inline bool CTrecCollection::processTag(char*& start,char* end,CTrecTagPair* p,char*& value,size_t& valueLen,bool skip)
00097                 {
                              // Skip leading whitespace without walking past the buffer bound.
00098                         while(start < end && (*start == '\n' || *start == ' '))
00099                                 start++;
00100                         if(start >= end || strncmp(start,p->beginTag,p->tagLen+2))
00101                                 return false;
00102 
                              // The value starts right after "<tag>". Fix it now, before any recovery
                              // scanning, so value/valueLen always refer to this field.
00103                         char* bTag = start + p->tagLen + 2;
00104                         char* eTag = strstr(bTag,p->endTag);
00105                         if(eTag == NULL)
00106                         {
00107                                 FIRTEX_CLOG(level::warn) << "can't find end tag : " << p->endTag<<endl;
                                      // strstr stopped at a NUL before reaching an end tag. Continue with
                                      // the text following each NUL, one segment at a time, and never
                                      // start a search at or beyond `end`.
00108                                 for(char* seg = bTag + strlen(bTag) + 1;seg < end && !eTag;seg += strlen(seg) + 1)
00109                                         eTag = strstr(seg,p->endTag);
00110                                 if(eTag == NULL)
00111                                 {
                                              // Nothing left to parse: leave the cursor exhausted.
00112                                         start = end;
00113                                         return false;
00114                                 }
00115                         }
00116                         if(!skip)
00117                         {
00118                                 value = bTag;
00119                                 valueLen = eTag - bTag;
00120                         }
                              // "</tag>" is tagLen + 3 characters long.
00121                         start = eTag + p->tagLen + 3;
00122                         return true;
00123                 }
00124         }
00125 }
00126 
00127 
00128 #endif

http://www.firtex.org http://www.sourceforge.net/projects/firtex