FirteX-高性能全文索引和检索平台API Documentation |
00001 // 00002 // Copyright(C) 2005--2006 Institute of Computing Tech, Chinese Academy of Sciences. 00003 // All rights reserved. 00004 // This file is part of FirteX (www.firtex.org) 00005 // 00006 // Use of the FirteX is subject to the terms of the software license set forth in 00007 // the LICENSE file included with this software, and also available at 00008 // http://www.firtex.org/license.html 00009 // 00010 // Author : 郭瑞杰(GuoRuijie) 00011 // Email : ruijieguo@software.ict.ac.cn 00012 // Created : 2006/6/10 00013 // 00014 00015 #ifndef _DATEFIELDINDEXER_H 00016 #define _DATEFIELDINDEXER_H 00017 00018 #include "../utility/StdHeader.h" 00019 #include "FieldIndexer.h" 00020 #include "Posting.h" 00021 #include "Vocabulary.h" 00022 00023 using namespace std; 00024 using namespace firtex; 00025 00026 #ifndef WIN32 00027 namespace __gnu_cxx 00028 { 00029 template<> struct hash< int64_t > 00030 { 00031 size_t operator()( int64_t __x ) const 00032 { 00033 return __x; 00034 } 00035 }; 00036 } 00037 #endif 00038 00039 namespace firtex 00040 { 00041 namespace index 00042 { 00043 00044 00045 template <typename TermT> 00046 class CDateVocabularyBuilder 00047 { 00048 public: 00049 typedef TermT term_type; 00050 protected: 00051 typedef struct _table_array 00052 { 00053 term_type tid; 00054 CPosting* posting; 00055 }table_array; 00056 public: 00057 CDateVocabularyBuilder(); 00058 CDateVocabularyBuilder(CPosMemCache* mc); 00059 ~CDateVocabularyBuilder(); 00060 public: 00061 void save(CIndexOutputDescriptor* pOutputDesc); 00062 CPosting* find(term_type t); 00063 00064 void clear(); 00065 count_t distinctNumTerms(){return (count_t)m_table.size();} 00066 protected: 00067 void quickSort(table_array lex[], int32_t lo, int32_t hi); 00068 protected: 00069 hash_map<term_type,CPosting*> m_table; 00070 CPosMemCache* m_memcache; 00071 00072 typedef pair<term_type,CPosting*> table_item_pair; 00073 typedef typename hash_map<term_type,CPosting*>::iterator table_iterator; 00074 }; 00075 00076 00078 template <typename TermT> 00079 class CDateVocabularyLoader 00080 { 00081 public: 00082 typedef TermT term_type; 00083 public: 00084 class term_table 00085 { 00086 public: 00087 term_table(){} 00088 ~term_table(){} 00089 term_type tid; 00090 CTermInfo ti; 00091 }; 00092 typedef term_table term_table_type; 00093 typedef term_table* term_table_ptr; 00094 00095 class CDateVocabularyLoaderIterator 00096 { 00097 public: 00098 CDateVocabularyLoaderIterator(term_table_ptr tbl,count_t numTerms); 00099 CDateVocabularyLoaderIterator(const CDateVocabularyLoaderIterator& clone); 00100 ~CDateVocabularyLoaderIterator(); 00101 00102 bool next(); 00103 bool skipTo(term_type t); 00104 term_type term(); 00105 CTermInfo* second(); 00106 protected: 00107 count_t m_termCount; 00108 term_table_ptr m_ptermtable; 00109 int32_t m_curPos; 00110 }; 00111 typedef CDateVocabularyLoaderIterator loader_iterator; 00112 public: 00113 CDateVocabularyLoader(); 00114 ~CDateVocabularyLoader(); 00115 public: 00117 void load(CIndexInput* pIndexInput); 00124 CTermInfo* find(term_type t); 00125 00129 loader_iterator terms(); 00130 00132 void clear(); 00133 00135 count_t distinctNumTerms(){return m_termCount;} 00136 protected: 00137 count_t m_termCount; 00138 term_table_ptr m_ptermtable; 00139 }; 00140 00141 typedef CVocabulary<int64_t,CDateVocabularyBuilder<int64_t>,CDateVocabularyLoader<int64_t> > CDateVocabulary; 00142 00143 class CDateFieldIndexer : public CFieldIndexer 00144 { 00145 public: 00146 CDateFieldIndexer(CPosMemCache* pMemCache); 00147 virtual ~CDateFieldIndexer(void); 00148 public: 00154 void addField(docid_t did,CField* pField); 00155 00159 void write(CIndexOutputDescriptor* pWriterDesc); 00160 00164 void setFilePointer(fileoffset_t off){m_tdiFilePointer = off;}; 00165 fileoffset_t getFilePointer(){return m_tdiFilePointer;}; 00166 00170 uint64_t distinctNumTerms(); 00171 protected: 00172 CDateVocabulary* m_pDictionary; 00173 00174 CPosMemCache* m_pMemCache; //内存缓冲 00175 fileoffset_t m_tdiFilePointer; 00176 }; 00177 00180 // 00181 template <typename TermT> 00182 CDateVocabularyLoader<TermT>::CDateVocabularyLoaderIterator::CDateVocabularyLoaderIterator(term_table_ptr tbl,count_t numTerms) 00183 :m_ptermtable(tbl) 00184 ,m_termCount(numTerms) 00185 ,m_curPos(-1) 00186 { 00187 } 00188 00189 template <typename TermT> 00190 CDateVocabularyLoader<TermT>::CDateVocabularyLoaderIterator::CDateVocabularyLoaderIterator(const CDateVocabularyLoaderIterator& clone) 00191 :m_ptermtable(clone.m_ptermtable) 00192 ,m_termCount(clone.m_termCount) 00193 ,m_curPos(clone.m_curPos) 00194 { 00195 } 00196 template <typename TermT> 00197 CDateVocabularyLoader<TermT>::CDateVocabularyLoaderIterator::~CDateVocabularyLoaderIterator() 00198 { 00199 } 00200 00201 template <typename TermT> 00202 bool CDateVocabularyLoader<TermT>::CDateVocabularyLoaderIterator::next() 00203 { 00204 m_curPos++; 00205 if(m_curPos < m_termCount) 00206 return true; 00207 return false; 00208 } 00209 template <typename TermT> 00210 bool CDateVocabularyLoader<TermT>::CDateVocabularyLoaderIterator::skipTo(TermT t) 00211 { 00212 int32_t k; 00213 int32_t start = 0,end = m_termCount - 1; 00214 int32_t nk = end; 00215 while (start<=end) 00216 { 00217 k = (start + end)/2; 00218 if(t == m_ptermtable[k].tid)//找到 00219 { 00220 nk = k; 00221 m_curPos = nk; 00222 return true; 00223 } 00224 if(t < m_ptermtable[k].tid)//查找左半边 00225 { 00226 end = k - 1; 00227 if(k >= start) 00228 { 00229 nk =k; 00230 } 00231 } 00232 else //查找右半边 00233 { 00234 start = k + 1; 00235 if(start <= end) 00236 { 00237 if(m_ptermtable[start].tid > t) 00238 { 00239 nk = start; 00240 } 00241 } 00242 } 00243 } 00244 if(m_ptermtable[nk].tid < t) 00245 { 00246 m_curPos = -1; 00247 return false; 00248 } 00249 m_curPos = nk; 00250 return true; 00251 } 00252 template <typename TermT> 00253 TermT CDateVocabularyLoader<TermT>::CDateVocabularyLoaderIterator::term() 00254 { 00255 return m_ptermtable[m_curPos].tid; 00256 } 00257 template <typename TermT> 00258 CTermInfo* CDateVocabularyLoader<TermT>::CDateVocabularyLoaderIterator::second() 00259 { 00260 return &(m_ptermtable[m_curPos].ti); 00261 } 00262 00263 template <typename TermT> 00264 CDateVocabularyLoader<TermT>::CDateVocabularyLoader() 00265 { 00266 } 00267 template <typename TermT> 00268 CDateVocabularyLoader<TermT>::~CDateVocabularyLoader() 00269 { 00270 00271 } 00272 00273 template <typename TermT> 00274 void CDateVocabularyLoader<TermT>::load(CIndexInput* pIndexInput) 00275 { 00276 m_termCount = pIndexInput->readInt(); 00277 m_ptermtable = new term_table_type[m_termCount]; 00278 freq_t df; 00279 fileoffset_t dfiP; 00280 for (int32_t i = 0;i < m_termCount;i++) 00281 { 00282 m_ptermtable[i].tid = pIndexInput->readVInt(); 00283 df = pIndexInput->readVInt(); 00284 dfiP = pIndexInput->readVLong(); 00285 m_ptermtable[i].ti.set(df,dfiP,0); 00286 } 00287 } 00288 template <typename TermT> 00289 CTermInfo* CDateVocabularyLoader<TermT>::find(TermT t) 00290 { 00291 int32_t start = 0,end = m_termCount-1; 00292 int32_t mid = (start + end)/2; 00293 while (start <= end) 00294 { 00295 mid = (start + end)/2; 00296 if(m_ptermtable[mid].tid == t) 00297 { 00298 return &(m_ptermtable[mid].ti); 00299 } 00300 if(m_ptermtable[mid].tid > t) 00301 { 00302 end = mid - 1; 00303 } 00304 else 00305 { 00306 start = mid + 1; 00307 } 00308 } 00309 return NULL; 00310 } 00311 00312 template <typename TermT> 00313 void CDateVocabularyLoader<TermT>::clear() 00314 { 00315 if(m_ptermtable) 00316 { 00317 delete[] m_ptermtable; 00318 m_ptermtable = NULL; 00319 } 00320 m_termCount = 0; 00321 } 00322 template <typename TermT> 00323 typename CDateVocabularyLoader<TermT>::loader_iterator CDateVocabularyLoader<TermT>::terms() 00324 { 00325 return typename CDateVocabularyLoader<TermT>::loader_iterator(m_ptermtable,m_termCount); 00326 } 00328 // 00329 template <typename TermT> 00330 CDateVocabularyBuilder<TermT>::CDateVocabularyBuilder() 00331 :m_memcache(NULL) 00332 { 00333 00334 } 00335 template <typename TermT> 00336 CDateVocabularyBuilder<TermT>::CDateVocabularyBuilder(CPosMemCache* mc) 00337 :m_memcache(mc) 00338 { 00339 } 00340 template <typename TermT> 00341 CDateVocabularyBuilder<TermT>::~CDateVocabularyBuilder() 00342 { 00343 00344 } 00345 00346 template <typename TermT> 00347 void CDateVocabularyBuilder<TermT>::save(CIndexOutputDescriptor* pOutputDesc) 00348 { 00349 CIndexOutput* tdiWriter = pOutputDesc->tdiWriter; 00350 CIndexOutput* dfiWriter = pOutputDesc->dfiWriter; 00351 00352 if(m_table.size() <= 0) 00353 return; 00354 00355 CPosting* pPost = NULL; 00356 00357 table_array* lex = new table_array[m_table.size()]; 00358 int32_t len = 0; 00359 table_iterator iter = m_table.begin(); 00360 while (iter != m_table.end()) 00361 { 00362 pPost = iter->second; 00363 if(!pPost->hasNoMem()) 00364 { 00365 lex[len].tid = iter->first; 00366 lex[len].posting = pPost; 00367 len ++; 00368 } 00369 iter ++; 00370 } 00371 00372 quickSort(lex,0,len-1);//快速排序 00373 00374 tdiWriter->writeInt(len);//词典长度 00375 for (int32_t i = 0;i < len;i++) 00376 { 00377 tdiWriter->writeVLong(lex[i].tid); //write termid 00378 pPost = lex[i].posting; 00379 00380 tdiWriter->writeVInt(pPost->docFreq()); //write df 00381 tdiWriter->writeVLong(dfiWriter->getFilePointer()); //write doc freq pos 00382 00383 pPost->writeDocFreq(dfiWriter); //write doc freq 00384 00385 pPost->reset(); //清空Posting 00386 } 00387 00388 delete[] lex; 00389 } 00390 template <typename TermT> 00391 inline CPosting* CDateVocabularyBuilder<TermT>::find(term_type t) 00392 { 00393 table_iterator iter = m_table.find(t); 00394 if(iter == m_table.end()) 00395 { 00396 CPosting* p = new CPosting(m_memcache); 00397 m_table.insert(table_item_pair(t,p)); 00398 return p; 00399 } 00400 return iter->second; 00401 } 00402 template <typename TermT> 00403 void CDateVocabularyBuilder<TermT>::clear() 00404 { 00405 table_iterator iter = m_table.begin(); 00406 while (iter != m_table.end()) 00407 { 00408 delete iter->second; 00409 iter++; 00410 } 00411 m_table.clear(); 00412 } 00413 template <typename TermT> 00414 void CDateVocabularyBuilder<TermT>::quickSort(table_array lex[], int32_t lo, int32_t hi) 00415 { 00416 if (lo >= hi) 00417 return; 00418 00419 int32_t mid = (lo + hi) / 2; 00420 table_array tmp; 00421 00422 if (lex[lo].tid > lex[mid].tid) 00423 { 00424 tmp = lex[lo]; 00425 lex[lo] = lex[mid]; 00426 lex[mid] = tmp; 00427 } 00428 00429 if (lex[mid].tid > lex[hi].tid) 00430 { 00431 tmp = lex[mid]; 00432 lex[mid] = lex[hi]; 00433 lex[hi] = tmp; 00434 00435 if (lex[lo].tid > lex[mid].tid) 00436 { 00437 tmp = lex[lo]; 00438 lex[lo] = lex[mid]; 00439 lex[mid] = tmp; 00440 } 00441 } 00442 00443 int32_t left = lo + 1; 00444 int32_t right = hi - 1; 00445 00446 if (left >= right) 00447 return; 00448 00449 term_type partition = lex[mid].tid; 00450 00451 for (; ;) 00452 { 00453 while (lex[right].tid > partition) 00454 --right; 00455 00456 while ( (left < right) && (lex[left].tid <= partition)) 00457 ++left; 00458 00459 if (left < right) 00460 { 00461 tmp = lex[left]; 00462 lex[left] = lex[right]; 00463 lex[right] = tmp; 00464 --right; 00465 } 00466 else 00467 { 00468 break; 00469 } 00470 } 00471 00472 quickSort(lex, lo, left); 00473 quickSort(lex, left + 1, hi); 00474 } 00475 } 00476 } 00477 00478 00479 #endif
http://www.firtex.org http://www.sourceforge.net/projects/firtex