FirteX-高性能全文索引和检索平台API Documentation |
00001 // 00002 // Copyright(C) 2005--2006 Institute of Computing Tech, Chinese Academy of Sciences. 00003 // All rights reserved. 00004 // This file is part of FirteX (www.firtex.org) 00005 // 00006 // Use of the FirteX is subject to the terms of the software license set forth in 00007 // the LICENSE file included with this software, and also available at 00008 // http://www.firtex.org/license.html 00009 // 00010 // Author : 郭瑞杰(GuoRuijie) 00011 // Email : ruijieguo@software.ict.ac.cn,ruijieguo@gmail.com 00012 // Created : 2005/12/3 00013 // 00014 #ifndef _WORDFIELDMERGER_H 00015 #define _WORDFIELDMERGER_H 00016 00017 #include "../utility/StdHeader.h" 00018 #include <vector> 00019 #include "FieldMerger.h" 00020 #include "../store/IndexInput.h" 00021 #include "TermInfo.h" 00022 using namespace std; 00023 using namespace firtex::store; 00024 00025 00026 namespace firtex 00027 { 00028 namespace index 00029 { 00030 struct CWordFieldMergeEntry:public CFieldMergeEntry 00031 { 00032 public: 00033 CWordFieldMergeEntry(CBarrelInfo* pBarrelInfo,CFieldInfo* pFieldInfo):CFieldMergeEntry(pBarrelInfo,pFieldInfo) 00034 { 00035 count = -1; 00036 tid = -1; 00037 firstDocID = -1; 00038 lastDocID = -1; 00039 00040 tdiReader = NULL; 00041 dfiReader = NULL; 00042 ptiReader = NULL; 00043 00044 dfiLen = 0; 00045 ptiLen = 0; 00046 cur = 0; 00047 } 00048 ~CWordFieldMergeEntry() 00049 { 00050 if(tdiReader != NULL) 00051 { 00052 tdiReader->close(); 00053 delete tdiReader; 00054 } 00055 if(dfiReader != NULL) 00056 { 00057 dfiReader->close(); 00058 delete dfiReader; 00059 } 00060 if(ptiReader != NULL) 00061 { 00062 ptiReader->close(); 00063 delete ptiReader; 00064 } 00065 } 00066 bool open(CDirectory* pDirectory,char* buf,size_t bufsize) 00067 { 00068 if(bufsize > 3*INDEXINPUT_BUFFSIZE) 00069 { 00070 size_t len = bufsize/3; 00071 tdiReader = pDirectory->openInput(m_pBarrelInfo->name() + ".tdi",buf,len); 00072 buf += len; 00073 dfiReader = pDirectory->openInput(m_pBarrelInfo->name() + ".dfi",buf,len); 00074 buf += len; 00075 ptiReader = pDirectory->openInput(m_pBarrelInfo->name() + ".pti",buf,len); 00076 } 00077 else 00078 { 00079 tdiReader = pDirectory->openInput(m_pBarrelInfo->name() + ".tdi"); 00080 dfiReader = pDirectory->openInput(m_pBarrelInfo->name() + ".dfi"); 00081 ptiReader = pDirectory->openInput(m_pBarrelInfo->name() + ".pti"); 00082 } 00083 int64_t tLen,dLen,pLen; 00084 m_pFieldInfo->getLength(&tLen,&dLen,&pLen); 00085 if(tLen <= 0) 00086 { 00087 count = 0; 00088 delete tdiReader; 00089 tdiReader = NULL; 00090 delete dfiReader; 00091 dfiReader = NULL; 00092 delete ptiReader; 00093 ptiReader = NULL; 00094 return false; 00095 } 00096 00097 00098 tdiReader->seek(m_pFieldInfo->getIndexOffset()); 00099 tdiReader->setLength(tdiReader->getFilePointer() + tLen); 00100 count = tdiReader->readInt(); //读Term总数 00101 if(count <= 0) 00102 return false; 00104 tid = tdiReader->readVInt(); 00105 count_t df = tdiReader->readVInt(); 00106 fileoffset_t of1 = tdiReader->readVLong(); 00107 fileoffset_t of2 = tdiReader->readVLong(); 00108 ti.set(df,of1,of2); 00109 00110 dfiReader->seek(of1); 00111 ptiReader->seek(of2); 00112 00113 dfiReader->setLength(dfiReader->getFilePointer() + dLen); 00114 ptiReader->setLength(ptiReader->getFilePointer() + pLen); 00115 00116 ptiLen = ptiReader->readVInt(); 00117 lastDocID = m_pBarrelInfo->minDocID() + dfiReader->readVInt(); 00118 dfiLen = dfiReader->readVInt(); 00119 firstDocID = m_pBarrelInfo->minDocID() + dfiReader->readVInt(); 00120 cur++; 00121 return true; 00122 } 00123 bool next() 00124 { 00125 if(cur >= count) 00126 { 00127 return false; 00128 tid = -1; 00129 } 00130 tid = tdiReader->readVInt(); 00131 count_t df = tdiReader->readVInt(); 00132 fileoffset_t of1 = tdiReader->readVLong(); 00133 fileoffset_t of2 = tdiReader->readVLong(); 00134 ti.set(df,of1,of2); 00135 dfiReader->seek(of1); 00136 ptiReader->seek(of2); 00137 00138 ptiLen = ptiReader->readVInt(); 00139 lastDocID = m_pBarrelInfo->minDocID() + dfiReader->readVInt(); 00140 dfiLen = dfiReader->readVInt(); 00141 firstDocID = m_pBarrelInfo->minDocID() + dfiReader->readVInt(); 00142 cur++; 00143 return true; 00144 } 00145 protected: 00146 CWordFieldMergeEntry() 00147 { 00148 } 00149 public: 00150 count_t count; //Term Count 00151 termid_t tid; //Term ID 00152 CTermInfo ti; //Term Info 00153 docid_t firstDocID; //Term的第一个文档编号 00154 docid_t lastDocID; //Term的最后一个文档编号 00155 00156 CIndexInput* tdiReader; 00157 CIndexInput* dfiReader; 00158 CIndexInput* ptiReader; 00159 00160 int dfiLen; //dfi文件Block长度 00161 int ptiLen; //pti文件Block长度 00162 00163 int cur; //当前处理的Term位置 00164 friend class CWordFieldMerger; 00165 }; 00166 00167 class CWordFieldMerger : public CFieldMerger 00168 { 00169 public: 00170 CWordFieldMerger(void); 00171 virtual ~CWordFieldMerger(void); 00172 public: 00173 void addField(CBarrelInfo* pBarrelInfo,CFieldInfo* pFieldInfo); 00174 count_t merge(CDirectory* pDirectory,CIndexOutputDescriptor* pDesc); 00175 protected: 00176 void close(); 00177 protected: 00178 vector<CWordFieldMergeEntry*> m_mergeFields; 00179 }; 00180 } 00181 } 00182 00183 #endif
http://www.firtex.org http://www.sourceforge.net/projects/firtex