FirteX-高性能全文索引和检索平台API Documentation |
00001 // 00002 // Copyright(C) 2005--2006 Institute of Computing Tech, Chinese Academy of Sciences. 00003 // All rights reserved. 00004 // This file is part of FirteX (www.firtex.org) 00005 // 00006 // Use of the FirteX is subject to the terms of the software license set forth in 00007 // the LICENSE file included with this software, and also available at 00008 // http://www.firtex.org/license.html 00009 // 00010 // Author : 郭瑞杰(GuoRuijie) 00011 // Email : ruijieguo@software.ict.ac.cn 00012 // Created : 2006/7/17 00013 // 00014 #ifndef _VOCABULARY_H 00015 #define _VOCABULARY_H 00016 00017 #if _MSC_VER > 1000 00018 #pragma once 00019 #endif // _MSC_VER > 1000 00020 00021 #include "../utility/Config.h" 00022 #include "../store/IndexInput.h" 00023 #include "../store/IndexOutput.h" 00024 #include "FieldIndexer.h" 00025 #include "Posting.h" 00026 #include "TermInfo.h" 00027 #include "../utility/DynamicArray.h" 00028 00029 #ifdef WIN32 00030 #include <hash_map> 00031 using namespace stdext; 00032 #else 00033 #include <ext/hash_map> 00034 using namespace __gnu_cxx; 00035 #endif 00036 00037 using namespace firtex::store; 00038 using namespace firtex::utility; 00039 00040 00041 namespace firtex 00042 { 00043 namespace index 00044 { 00046 template <typename TermT> 00047 class CHashMapBuilder 00048 { 00049 public: 00050 typedef TermT term_type; 00051 protected: 00052 typedef struct _table_array 00053 { 00054 term_type tid; 00055 CPosting* posting; 00056 }table_array; 00057 public: 00058 CHashMapBuilder(); 00059 CHashMapBuilder(CPosMemCache* mc); 00060 ~CHashMapBuilder(); 00061 public: 00062 void save(CIndexOutputDescriptor* pOutputDesc); 00063 CPosting* find(term_type t); 00064 00065 void clear(); 00066 count_t distinctNumTerms(){return (count_t)m_table.size();} 00067 protected: 00068 void quickSort(table_array lex[], int32_t lo, int32_t hi); 00069 protected: 00070 hash_map<term_type,CPosting*> m_table; 00071 CPosMemCache* m_memcache; 00072 00073 typedef pair<term_type,CPosting*> table_item_pair; 00074 typedef typename hash_map<term_type,CPosting*>::iterator table_iterator; 00075 }; 00076 00077 template <typename TermT> 00078 class CDynamicArrayBuilder 00079 { 00080 public: 00081 typedef TermT term_type; 00082 protected: 00083 public: 00084 CDynamicArrayBuilder(); 00085 CDynamicArrayBuilder(CPosMemCache* mc); 00086 ~CDynamicArrayBuilder(); 00087 public: 00088 void save(CIndexOutputDescriptor* pOutputDesc); 00089 CPosting* find(term_type t); 00090 00091 void clear(); 00092 count_t distinctNumTerms(){return (count_t)m_array.length();} 00093 protected: 00094 CDynamicArray<CPosting*,Const_NullValue<CPosting*> > m_array; 00095 CPosMemCache* m_memcache; 00096 }; 00097 00098 00100 template <typename TermT> 00101 class CVocabularyLoader 00102 { 00103 public: 00104 typedef TermT term_type; 00105 public: 00106 class term_table 00107 { 00108 public: 00109 term_table(){} 00110 ~term_table(){} 00111 term_type tid; 00112 CTermInfo ti; 00113 }; 00114 typedef term_table term_table_type; 00115 typedef term_table* term_table_ptr; 00116 00117 class CVocabularyLoaderIterator 00118 { 00119 public: 00120 CVocabularyLoaderIterator(term_table_ptr tbl,count_t numTerms); 00121 CVocabularyLoaderIterator(const CVocabularyLoaderIterator& clone); 00122 ~CVocabularyLoaderIterator(); 00123 00124 bool next(); 00125 bool skipTo(term_type t); 00126 term_type term(); 00127 CTermInfo* second(); 00128 protected: 00129 count_t m_termCount; 00130 term_table_ptr m_ptermtable; 00131 int32_t m_curPos; 00132 }; 00133 typedef CVocabularyLoaderIterator loader_iterator; 00134 public: 00135 CVocabularyLoader(); 00136 ~CVocabularyLoader(); 00137 public: 00139 void load(CIndexInput* pIndexInput); 00146 CTermInfo* find(term_type t); 00147 00151 loader_iterator terms(); 00152 00154 void clear(); 00155 00157 count_t distinctNumTerms(){return m_termCount;} 00158 protected: 00159 count_t m_termCount; 00160 term_table_ptr m_ptermtable; 00161 }; 00162 00164 template<typename TermT,typename BuilderT=CHashMapBuilder<TermT> ,typename LoaderT = CVocabularyLoader<TermT> > 00165 class CVocabulary 00166 { 00167 public: 00168 typedef BuilderT builder_type; 00169 typedef LoaderT loader_type; 00170 typedef typename LoaderT::loader_iterator vacabulary_iterator; 00171 public: 00172 CVocabulary(CPosMemCache* mc); 00173 CVocabulary(void); 00174 ~CVocabulary(void); 00175 public: 00176 inline BuilderT* getBuilder(); 00177 inline LoaderT* getLoader(); 00178 00180 void load(CIndexInput* pIndexInput); 00181 00183 void save(CIndexOutputDescriptor* pOutputDesc); 00184 00186 vacabulary_iterator terms() 00187 { 00188 FIRTEX_ASSERT((m_loader!=NULL),_T("no vocabulary loader.")); 00189 return m_loader->terms(); 00190 } 00191 00193 void clear(); 00194 00196 inline count_t distinctNumTerms(); 00197 private: 00198 builder_type* m_builder; 00199 loader_type* m_loader; 00200 }; 00201 00203 // 00204 template <typename TermT> 00205 CVocabularyLoader<TermT>::CVocabularyLoaderIterator::CVocabularyLoaderIterator(term_table_ptr tbl,count_t numTerms) 00206 :m_ptermtable(tbl) 00207 ,m_termCount(numTerms) 00208 ,m_curPos(-1) 00209 { 00210 } 00211 00212 template <typename TermT> 00213 CVocabularyLoader<TermT>::CVocabularyLoaderIterator::CVocabularyLoaderIterator(const CVocabularyLoaderIterator& clone) 00214 :m_ptermtable(clone.m_ptermtable) 00215 ,m_termCount(clone.m_termCount) 00216 ,m_curPos(clone.m_curPos) 00217 { 00218 } 00219 template <typename TermT> 00220 CVocabularyLoader<TermT>::CVocabularyLoaderIterator::~CVocabularyLoaderIterator() 00221 { 00222 } 00223 00224 template <typename TermT> 00225 bool CVocabularyLoader<TermT>::CVocabularyLoaderIterator::next() 00226 { 00227 m_curPos++; 00228 if(m_curPos < m_termCount) 00229 return true; 00230 return false; 00231 } 00232 template <typename TermT> 00233 bool CVocabularyLoader<TermT>::CVocabularyLoaderIterator::skipTo(TermT t) 00234 { 00235 int32_t k; 00236 int32_t start = 0,end = m_termCount - 1; 00237 int32_t nk = end; 00238 while (start<=end) 00239 { 00240 k = (start + end)/2; 00241 if(t == m_ptermtable[k].tid)//找到 00242 { 00243 nk = k; 00244 m_curPos = nk; 00245 return true; 00246 } 00247 if(t < m_ptermtable[k].tid)//查找左半边 00248 { 00249 end = k - 1; 00250 if(k >= start) 00251 { 00252 nk =k; 00253 } 00254 } 00255 else //查找右半边 00256 { 00257 start = k + 1; 00258 if(start <= end) 00259 { 00260 if(m_ptermtable[start].tid > t) 00261 { 00262 nk = start; 00263 } 00264 } 00265 } 00266 } 00267 if(m_ptermtable[nk].tid < t) 00268 { 00269 m_curPos = -1; 00270 return false; 00271 } 00272 m_curPos = nk; 00273 return true; 00274 } 00275 template <typename TermT> 00276 TermT CVocabularyLoader<TermT>::CVocabularyLoaderIterator::term() 00277 { 00278 return m_ptermtable[m_curPos].tid; 00279 } 00280 template <typename TermT> 00281 CTermInfo* CVocabularyLoader<TermT>::CVocabularyLoaderIterator::second() 00282 { 00283 return m_ptermtable[m_curPos].ti; 00284 } 00285 00286 template <typename TermT> 00287 CVocabularyLoader<TermT>::CVocabularyLoader() 00288 { 00289 } 00290 template <typename TermT> 00291 CVocabularyLoader<TermT>::~CVocabularyLoader() 00292 { 00293 00294 } 00295 00296 template <typename TermT> 00297 void CVocabularyLoader<TermT>::load(CIndexInput* pIndexInput) 00298 { 00299 m_termCount = pIndexInput->readInt(); 00300 m_ptermtable = new term_table_type[m_termCount]; 00301 freq_t df; 00302 fileoffset_t dfiP,ptiP; 00303 for (int32_t i = 0;i < m_termCount;i++) 00304 { 00305 m_ptermtable[i].tid = pIndexInput->readVInt(); 00306 df = pIndexInput->readVInt(); 00307 dfiP = pIndexInput->readVLong(); 00308 ptiP = pIndexInput->readVLong(); 00309 m_ptermtable[i].ti.set(df,dfiP,ptiP); 00310 } 00311 } 00312 template <typename TermT> 00313 CTermInfo* CVocabularyLoader<TermT>::find(TermT t) 00314 { 00315 int32_t start = 0,end = m_termCount-1; 00316 int32_t mid = (start + end)/2; 00317 while (start <= end) 00318 { 00319 mid = (start + end)/2; 00320 if(m_ptermtable[mid].tid == t) 00321 { 00322 return &(m_ptermtable[mid].ti); 00323 } 00324 if(m_ptermtable[mid].tid > t) 00325 { 00326 end = mid - 1; 00327 } 00328 else 00329 { 00330 start = mid + 1; 00331 } 00332 } 00333 return NULL; 00334 } 00335 00336 template <typename TermT> 00337 void CVocabularyLoader<TermT>::clear() 00338 { 00339 if(m_ptermtable) 00340 { 00341 delete[] m_ptermtable; 00342 m_ptermtable = NULL; 00343 } 00344 m_termCount = 0; 00345 } 00346 template <typename TermT> 00347 typename CVocabularyLoader<TermT>::loader_iterator CVocabularyLoader<TermT>::terms() 00348 { 00349 return typename CVocabularyLoader<TermT>::loader_iterator(m_ptermtable,m_termCount);; 00350 } 00352 // 00353 template <typename TermT> 00354 CHashMapBuilder<TermT>::CHashMapBuilder() 00355 :m_memcache(NULL) 00356 { 00357 00358 } 00359 template <typename TermT> 00360 CHashMapBuilder<TermT>::CHashMapBuilder(CPosMemCache* mc) 00361 :m_memcache(mc) 00362 { 00363 00364 } 00365 template <typename TermT> 00366 CHashMapBuilder<TermT>::~CHashMapBuilder() 00367 { 00368 clear(); 00369 } 00370 00371 template <typename TermT> 00372 void CHashMapBuilder<TermT>::save(CIndexOutputDescriptor* pOutputDesc) 00373 { 00374 CIndexOutput* tdiWriter = pOutputDesc->tdiWriter; 00375 CIndexOutput* dfiWriter = pOutputDesc->dfiWriter; 00376 CIndexOutput* ptiWriter = pOutputDesc->ptiWriter; 00377 00378 if(m_table.size() <= 0) 00379 return; 00380 00381 CPosting* pPost = NULL; 00382 00383 table_array* lex = new table_array[m_table.size()]; 00384 int32_t len = 0; 00385 table_iterator iter = m_table.begin(); 00386 while (iter != m_table.end()) 00387 { 00388 pPost = iter->second; 00389 if(!pPost->hasNoMem()) 00390 { 00391 lex[len].tid = iter->first; 00392 lex[len].posting = pPost; 00393 len ++; 00394 } 00395 iter ++; 00396 } 00397 00398 if(len == 0) 00399 return ; 00400 quickSort(lex,0,len-1);//快速排序 00401 00402 tdiWriter->writeInt(len);//词典长度 00403 for (int32_t i = 0;i < len;i++) 00404 { 00405 tdiWriter->writeVInt(lex[i].tid); //write termid 00406 pPost = lex[i].posting; 00407 00408 tdiWriter->writeVInt(pPost->docFreq()); //write df 00409 tdiWriter->writeVLong(dfiWriter->getFilePointer()); //write doc freq pos 00410 tdiWriter->writeVLong(ptiWriter->getFilePointer()); //write posting pos 00411 00412 pPost->writeDocFreq(dfiWriter); //write doc freq 00413 pPost->writePosition(ptiWriter); //write posting 00414 00415 pPost->reset(); //清空Posting 00416 } 00417 00418 delete[] lex; 00419 } 00420 template <typename TermT> 00421 inline CPosting* CHashMapBuilder<TermT>::find(term_type t) 00422 { 00423 table_iterator iter = m_table.find(t); 00424 if(iter == m_table.end()) 00425 { 00426 CPosting* p = new CPosting(m_memcache); 00427 m_table.insert(table_item_pair(t,p)); 00428 return p; 00429 } 00430 return iter->second; 00431 } 00432 template <typename TermT> 00433 void CHashMapBuilder<TermT>::clear() 00434 { 00435 table_iterator iter = m_table.begin(); 00436 while (iter != m_table.end()) 00437 { 00438 delete iter->second; 00439 iter++; 00440 } 00441 m_table.clear(); 00442 } 00443 template <typename TermT> 00444 void CHashMapBuilder<TermT>::quickSort(table_array lex[], int32_t lo, int32_t hi) 00445 { 00446 if (lo >= hi) 00447 return; 00448 00449 int32_t mid = (lo + hi) / 2; 00450 table_array tmp; 00451 00452 if (lex[lo].tid > lex[mid].tid) 00453 { 00454 tmp = lex[lo]; 00455 lex[lo] = lex[mid]; 00456 lex[mid] = tmp; 00457 } 00458 00459 if (lex[mid].tid > lex[hi].tid) 00460 { 00461 tmp = lex[mid]; 00462 lex[mid] = lex[hi]; 00463 lex[hi] = tmp; 00464 00465 if (lex[lo].tid > lex[mid].tid) 00466 { 00467 tmp = lex[lo]; 00468 lex[lo] = lex[mid]; 00469 lex[mid] = tmp; 00470 } 00471 } 00472 00473 int32_t left = lo + 1; 00474 int32_t right = hi - 1; 00475 00476 if (left >= right) 00477 return; 00478 00479 term_type partition = lex[mid].tid; 00480 00481 for (; ;) 00482 { 00483 while (lex[right].tid > partition) 00484 --right; 00485 00486 while ( (left < right) && (lex[left].tid <= partition)) 00487 ++left; 00488 00489 if (left < right) 00490 { 00491 tmp = lex[left]; 00492 lex[left] = lex[right]; 00493 lex[right] = tmp; 00494 --right; 00495 } 00496 else 00497 { 00498 break; 00499 } 00500 } 00501 00502 quickSort(lex, lo, left); 00503 quickSort(lex, left + 1, hi); 00504 } 00505 00507 // 00509 //CDynamicArrayBuilder 00510 template <typename TermT> 00511 CDynamicArrayBuilder<TermT>::CDynamicArrayBuilder() 00512 :m_memcache(NULL) 00513 { 00514 } 00515 template <typename TermT> 00516 CDynamicArrayBuilder<TermT>::CDynamicArrayBuilder(CPosMemCache* mc) 00517 :m_memcache(mc) 00518 { 00519 00520 } 00521 template <typename TermT> 00522 CDynamicArrayBuilder<TermT>::~CDynamicArrayBuilder() 00523 { 00524 clear(); 00525 } 00526 00527 template <typename TermT> 00528 void CDynamicArrayBuilder<TermT>::save(CIndexOutputDescriptor* pOutputDesc) 00529 { 00530 CIndexOutput* tdiWriter = pOutputDesc->tdiWriter; 00531 CIndexOutput* dfiWriter = pOutputDesc->dfiWriter; 00532 CIndexOutput* ptiWriter = pOutputDesc->ptiWriter; 00533 00534 int32_t len = 0; 00535 CDynamicArray<CPosting*,Const_NullValue<CPosting*> >::array_iterator it = m_array.elements(); 00536 while(it.next()) 00537 { 00538 00539 if(!(it.element()->hasNoMem())) 00540 { 00541 len++; 00542 } 00543 } 00544 if(len <= 0) 00545 return; 00546 tdiWriter->writeInt(len);//词典长度 00547 00548 CPosting* pPosting; 00549 CDynamicArray<CPosting*,Const_NullValue<CPosting*> >::array_iterator aiter = m_array.elements(); 00550 while(aiter.next()) 00551 { 00552 pPosting = aiter.element(); 00553 if(!pPosting->hasNoMem()) 00554 { 00555 tdiWriter->writeVInt((int32_t)aiter.position()); //write termid 00556 00557 tdiWriter->writeVInt(pPosting->docFreq()); //write df 00558 tdiWriter->writeVLong(dfiWriter->getFilePointer()); //write doc freq pos 00559 tdiWriter->writeVLong(ptiWriter->getFilePointer()); //write posting pos 00560 00561 pPosting->writeDocFreq(dfiWriter); //write doc freq 00562 pPosting->writePosition(ptiWriter); //write posting 00563 00564 pPosting->reset(); //清空Posting 00565 } 00566 } 00567 } 00568 template <typename TermT> 00569 inline CPosting* CDynamicArrayBuilder<TermT>::find(term_type t) 00570 { 00571 CPosting* p = m_array[t]; 00572 if(p == NULL) 00573 { 00574 p = new CPosting(m_memcache); 00575 m_array[t] = p; 00576 return p; 00577 } 00578 return p; 00579 } 00580 template <typename TermT> 00581 void CDynamicArrayBuilder<TermT>::clear() 00582 { 00583 CPosting* pPosting; 00584 CDynamicArray<CPosting*,Const_NullValue<CPosting*> >::array_iterator aiter = m_array.elements(); 00585 while(aiter.next()) 00586 { 00587 pPosting = aiter.element(); 00588 if(pPosting) 00589 delete pPosting; 00590 } 00591 m_array.clear(); 00592 } 00593 00595 // 00596 template<typename TermT,typename BuilderT,typename LoaderT> 00597 CVocabulary<TermT,BuilderT,LoaderT>::CVocabulary(CPosMemCache* mc) 00598 :m_builder(new BuilderT(mc)) 00599 ,m_loader(NULL) 00600 { 00601 } 00602 template<typename TermT,typename BuilderT,typename LoaderT> 00603 CVocabulary<TermT,BuilderT,LoaderT>::CVocabulary(void) 00604 :m_builder(NULL) 00605 ,m_loader(NULL) 00606 { 00607 } 00608 template<typename TermT,typename BuilderT,typename LoaderT> 00609 CVocabulary<TermT,BuilderT,LoaderT>::~CVocabulary(void) 00610 { 00611 if(m_builder) 00612 { 00613 delete m_builder; 00614 m_builder = NULL; 00615 } 00616 if(m_loader) 00617 { 00618 delete m_loader; 00619 m_loader = NULL; 00620 } 00621 } 00622 00623 template<typename TermT,typename BuilderT,typename LoaderT> 00624 inline BuilderT* CVocabulary<TermT,BuilderT,LoaderT>::getBuilder() 00625 { 00626 return m_builder; 00627 } 00628 template<typename TermT,typename BuilderT,typename LoaderT> 00629 inline LoaderT* CVocabulary<TermT,BuilderT,LoaderT>::getLoader() 00630 { 00631 return m_loader; 00632 } 00633 template<typename TermT,typename BuilderT,typename LoaderT> 00634 void CVocabulary<TermT,BuilderT,LoaderT>::load(CIndexInput* pIndexInput) 00635 { 00636 if(m_loader == NULL) 00637 m_loader = new LoaderT(); 00638 m_loader->load(pIndexInput); 00639 } 00640 00641 template<typename TermT,typename BuilderT,typename LoaderT> 00642 void CVocabulary<TermT,BuilderT,LoaderT>::save(CIndexOutputDescriptor* pOutputDesc) 00643 { 00644 FIRTEX_ASSERT((m_builder!=NULL),_T("no vocabulary builder.")); 00645 m_builder->save(pOutputDesc); 00646 } 00647 00648 template<typename TermT,typename BuilderT,typename LoaderT> 00649 void CVocabulary<TermT,BuilderT,LoaderT>::clear() 00650 { 00651 if(m_builder) 00652 m_builder->clear(); 00653 if(m_loader) 00654 m_loader->clear(); 00655 } 00656 00657 template<typename TermT,typename BuilderT,typename LoaderT> 00658 inline count_t CVocabulary<TermT,BuilderT,LoaderT>::distinctNumTerms() 00659 { 00660 if(m_loader) 00661 return m_loader->distinctNumTerms(); 00662 return 0; 00663 } 00664 } 00665 } 00666 00667 #endif
http://www.firtex.org http://www.sourceforge.net/projects/firtex