FirteX - 高性能全文索引和检索平台 API Documentation
00001 // 00002 // Copyright(C) 2005--2006 Institute of Computing Tech, Chinese Academy of Sciences. 00003 // All rights reserved. 00004 // This file is part of FirteX (www.firtex.org) 00005 // 00006 // Use of the FirteX is subject to the terms of the software license set forth in 00007 // the LICENSE file included with this software, and also available at 00008 // http://www.firtex.org/license.html 00009 // 00010 // Author : 郭瑞杰(GuoRuijie) 00011 // Email : ruijieguo@software.ict.ac.cn,ruijieguo@gmail.com 00012 // Created : 2006/8/2 00013 // 00014 #ifndef _TRECCOLLECTION_H 00015 #define _TRECCOLLECTION_H 00016 00017 #if _MSC_VER > 1000 00018 #pragma once 00019 #endif // _MSC_VER > 1000 00020 00021 #include "Collection.h" 00022 00023 #define MAX_TAGS 10 00024 00025 namespace firtex 00026 { 00027 namespace collection 00028 { 00029 class CTrecCollection : public CCollection 00030 { 00031 public: 00032 static const tstring identifier; 00033 protected: 00034 class CTrecTagPair 00035 { 00036 public: 00037 CTrecTagPair(schemaid_t sid,const tchar* tag) 00038 :schemaid(sid) 00039 { 00040 tagLen = (int32_t)_tcslen(tag); 00041 beginTag = new tchar[tagLen + 3]; 00042 beginTag[0] = _T('<'); 00043 _tcscpy(beginTag + 1,tag); 00044 beginTag[tagLen + 1] = _T('>'); 00045 beginTag[tagLen + 2] = 0; 00046 endTag = new tchar[_tcslen(tag) + 4]; 00047 _tcscpy(endTag + 2,tag); 00048 endTag[0] = _T('<'); 00049 endTag[1] = _T('/'); 00050 endTag[tagLen + 2] = _T('>'); 00051 endTag[tagLen + 3] = 0; 00052 } 00053 ~CTrecTagPair() 00054 { 00055 delete[] beginTag; 00056 delete[] endTag; 00057 } 00058 public: 00059 schemaid_t schemaid; 00060 tchar* beginTag; 00061 tchar* endTag; 00062 int32_t tagLen; 00063 }; 00064 public: 00065 CTrecCollection(void); 00066 CTrecCollection(const tchar* dir,CIndexWriter* pWriter); 00067 virtual ~CTrecCollection(void); 00068 public: 00072 bool scanInternal(); 00073 00078 void parseFile(const tstring& filename); 00079 protected: 00083 void loadSchema(const tstring& 
schemaFile,CDocumentSchema& schema); 00084 bool processTag(char*& start,char* end,CTrecTagPair* p,char*& value,size_t& valueLen,bool skip); 00085 00086 void doParse(); 00087 protected: 00088 tstring m_sDirectory; 00089 CTrecTagPair** m_tags; 00090 int32_t m_numTags; 00091 char* m_contentBuffer; 00092 int32_t m_length; 00093 }; 00095 // 00096 inline bool CTrecCollection::processTag(char*& start,char* end,CTrecTagPair* p,char*& value,size_t& valueLen,bool skip) 00097 { 00098 while(*start == '\n' || *start == ' ') 00099 start++; 00100 if(strncmp(start,p->beginTag,p->tagLen+2)) 00101 return false; 00102 00103 char* eTag = strstr(start,p->endTag); 00104 if(eTag == NULL) 00105 { 00106 FIRTEX_CLOG(level::warn) << "can't find end tag : " << p->endTag<<endl; 00107 do 00108 { 00109 start += p->tagLen + 3; 00110 eTag = strstr(start,p->endTag); 00111 } while(start < end && !eTag); 00112 if(eTag == NULL) 00113 return false; 00114 } 00115 char* bTag = (start + p->tagLen + 2); 00116 if(!skip) 00117 { 00118 value = bTag; 00119 valueLen = eTag - bTag; 00120 } 00121 start = eTag + p->tagLen + 3; 00122 return true; 00123 } 00124 } 00125 } 00126 00127 00128 #endif
http://www.firtex.org http://www.sourceforge.net/projects/firtex