FirteX-高性能全文索引和检索平台

API Documentation


首页 | 名字空间列表 | 类继承关系 | 组合类型列表 | $(BL\录(B | 文件列表 | 名字空间成员 | 组合类型成员 | 文件成员

Vocabulary.h

浏览该文件的文档。
00001 //
00002 // Copyright(C) 2005--2006 Institute of Computing Tech, Chinese Academy of Sciences. 
00003 // All rights reserved.
00004 // This file is part of FirteX (www.firtex.org)
00005 //
00006 // Use of the FirteX is subject to the terms of the software license set forth in 
00007 // the LICENSE file included with this software, and also available at
00008 // http://www.firtex.org/license.html
00009 //
00010 // Author       : 郭瑞杰(GuoRuijie)
00011 // Email        : ruijieguo@software.ict.ac.cn
00012 // Created      : 2006/7/17
00013 //
00014 #ifndef _VOCABULARY_H
00015 #define _VOCABULARY_H
00016 
00017 #if _MSC_VER > 1000
00018 #pragma once
00019 #endif // _MSC_VER > 1000
00020 
00021 #include "../utility/Config.h"
00022 #include "../store/IndexInput.h"
00023 #include "../store/IndexOutput.h"
00024 #include "FieldIndexer.h"
00025 #include "Posting.h"
00026 #include "TermInfo.h"
00027 #include "../utility/DynamicArray.h"
00028 
00029 #ifdef WIN32
00030         #include <hash_map>
00031         using namespace stdext;
00032 #else
00033         #include <ext/hash_map>
00034         using namespace __gnu_cxx;
00035 #endif
00036 
00037 using namespace firtex::store;
00038 using namespace firtex::utility;
00039 
00040 
00041 namespace firtex
00042 {
00043         namespace index
00044         {               
00046                 template <typename TermT>
00047                 class CHashMapBuilder
00048                 {
00049                 public:
00050                         typedef TermT                                   term_type;
00051                 protected:
00052                         typedef struct _table_array
00053                         {
00054                                 term_type               tid;
00055                                 CPosting*               posting;
00056                         }table_array;
00057                 public:
00058                         CHashMapBuilder();
00059                         CHashMapBuilder(CPosMemCache* mc);
00060                         ~CHashMapBuilder();
00061                 public:
00062                         void                    save(CIndexOutputDescriptor* pOutputDesc);
00063                         CPosting*               find(term_type t);                      
00064                         
00065                         void                    clear();
00066                         count_t                 distinctNumTerms(){return (count_t)m_table.size();}
00067                 protected:                      
00068                         void quickSort(table_array lex[], int32_t lo, int32_t hi);
00069                 protected:                              
00070                         hash_map<term_type,CPosting*>   m_table;
00071                         CPosMemCache*                                   m_memcache;
00072 
00073                         typedef pair<term_type,CPosting*> table_item_pair;
00074                         typedef typename hash_map<term_type,CPosting*>::iterator table_iterator;                        
00075                 };
00076                 
00077                 template <typename TermT>
00078                 class CDynamicArrayBuilder
00079                 {
00080                 public:
00081                         typedef TermT                                   term_type;
00082                 protected:                      
00083                 public:
00084                         CDynamicArrayBuilder();
00085                         CDynamicArrayBuilder(CPosMemCache* mc);
00086                         ~CDynamicArrayBuilder();
00087                 public:
00088                         void                    save(CIndexOutputDescriptor* pOutputDesc);
00089                         CPosting*               find(term_type t);                      
00090 
00091                         void                    clear();
00092                         count_t                 distinctNumTerms(){return (count_t)m_array.length();}
00093                 protected:                              
00094                         CDynamicArray<CPosting*,Const_NullValue<CPosting*> >    m_array;
00095                         CPosMemCache*                                   m_memcache;
00096                 };
00097 
00098 
00100                 template <typename TermT>
00101                 class CVocabularyLoader
00102                 {
00103                 public:
00104                         typedef TermT                   term_type;
00105                 public:
00106                         class term_table
00107                         {
00108                         public:
00109                                 term_table(){}
00110                                 ~term_table(){}
00111                                 term_type               tid;
00112                                 CTermInfo               ti;
00113                         };
00114                         typedef term_table term_table_type;
00115                         typedef term_table* term_table_ptr;
00116 
00117                         class CVocabularyLoaderIterator
00118                         {
00119                         public:
00120                                 CVocabularyLoaderIterator(term_table_ptr tbl,count_t numTerms);
00121                                 CVocabularyLoaderIterator(const CVocabularyLoaderIterator& clone);
00122                                 ~CVocabularyLoaderIterator();
00123 
00124                                 bool            next();
00125                                 bool            skipTo(term_type t);
00126                                 term_type       term();
00127                                 CTermInfo*      second();
00128                         protected:
00129                                 count_t                 m_termCount;
00130                                 term_table_ptr  m_ptermtable;
00131                                 int32_t                 m_curPos;
00132                         };
00133                         typedef CVocabularyLoaderIterator loader_iterator;
00134                 public:
00135                         CVocabularyLoader();
00136                         ~CVocabularyLoader();
00137                 public:
00139                         void            load(CIndexInput* pIndexInput);
00146                         CTermInfo*      find(term_type t);
00147 
00151                         loader_iterator terms();
00152 
00154                         void            clear();                                
00155 
00157                         count_t         distinctNumTerms(){return m_termCount;}
00158                 protected:              
00159                         count_t                 m_termCount;
00160                         term_table_ptr  m_ptermtable;
00161                 };
00162 
00164                 template<typename TermT,typename BuilderT=CHashMapBuilder<TermT> ,typename LoaderT = CVocabularyLoader<TermT> >
00165                 class CVocabulary
00166                 {
00167                 public:
00168                         typedef BuilderT        builder_type;
00169                         typedef LoaderT         loader_type;
00170                         typedef typename LoaderT::loader_iterator       vacabulary_iterator;
00171                 public:
00172                         CVocabulary(CPosMemCache* mc);
00173                         CVocabulary(void);
00174                         ~CVocabulary(void);
00175                 public:
00176                         inline BuilderT*        getBuilder();
00177                         inline LoaderT*         getLoader();
00178 
00180                         void                    load(CIndexInput* pIndexInput);
00181 
00183                         void                    save(CIndexOutputDescriptor* pOutputDesc);
00184 
00186                         vacabulary_iterator     terms()
00187                         {
00188                                 FIRTEX_ASSERT((m_loader!=NULL),_T("no vocabulary loader."));
00189                                 return m_loader->terms();
00190                         }
00191 
00193                         void                    clear();
00194 
00196                         inline count_t  distinctNumTerms();
00197                 private:
00198                         builder_type*   m_builder;
00199                         loader_type*    m_loader;
00200                 };
00201 
00203                 //
00204                 template <typename TermT>
00205                         CVocabularyLoader<TermT>::CVocabularyLoaderIterator::CVocabularyLoaderIterator(term_table_ptr tbl,count_t numTerms)
00206                         :m_ptermtable(tbl)
00207                         ,m_termCount(numTerms)
00208                         ,m_curPos(-1)
00209                 {
00210                 }
00211 
00212                 template <typename TermT>
00213                         CVocabularyLoader<TermT>::CVocabularyLoaderIterator::CVocabularyLoaderIterator(const CVocabularyLoaderIterator& clone)
00214                         :m_ptermtable(clone.m_ptermtable)
00215                         ,m_termCount(clone.m_termCount)
00216                         ,m_curPos(clone.m_curPos)
00217                 {
00218                 }
00219                 template <typename TermT>
00220                         CVocabularyLoader<TermT>::CVocabularyLoaderIterator::~CVocabularyLoaderIterator()
00221                 {
00222                 }
00223 
00224                 template <typename TermT>
00225                         bool CVocabularyLoader<TermT>::CVocabularyLoaderIterator::next()
00226                 {
00227                         m_curPos++;
00228                         if(m_curPos < m_termCount)
00229                                 return true;
00230                         return false;
00231                 }
00232                 template <typename TermT>
00233                         bool CVocabularyLoader<TermT>::CVocabularyLoaderIterator::skipTo(TermT t)
00234                 {
00235                         int32_t k;
00236                         int32_t start = 0,end = m_termCount - 1;
00237                         int32_t nk = end;
00238                         while (start<=end)
00239                         {
00240                                 k = (start + end)/2;
00241                                 if(t == m_ptermtable[k].tid)//找到
00242                                 {
00243                                         nk = k;
00244                                         m_curPos = nk;
00245                                         return true;
00246                                 }
00247                                 if(t < m_ptermtable[k].tid)//查找左半边
00248                                 {
00249                                         end = k - 1;
00250                                         if(k >= start)
00251                                         {                                               
00252                                                 nk =k;
00253                                         }
00254                                 }
00255                                 else //查找右半边
00256                                 {
00257                                         start = k + 1;                                  
00258                                         if(start <= end)
00259                                         {
00260                                                 if(m_ptermtable[start].tid > t)
00261                                                 {                                                       
00262                                                         nk = start;
00263                                                 }
00264                                         }
00265                                 }
00266                         }
00267                         if(m_ptermtable[nk].tid < t)
00268                         {
00269                                 m_curPos = -1;
00270                                 return false;
00271                         }
00272                         m_curPos = nk;
00273                         return true;
00274                 }
00275                 template <typename TermT>
00276                         TermT CVocabularyLoader<TermT>::CVocabularyLoaderIterator::term()
00277                 {
00278             return m_ptermtable[m_curPos].tid;
00279                 }
00280                 template <typename TermT>
00281                         CTermInfo* CVocabularyLoader<TermT>::CVocabularyLoaderIterator::second()
00282                 {
00283                         return m_ptermtable[m_curPos].ti;
00284                 }                               
00285         
00286                 template <typename TermT>
00287                         CVocabularyLoader<TermT>::CVocabularyLoader()
00288                 {
00289                 }
00290                 template <typename TermT>
00291                         CVocabularyLoader<TermT>::~CVocabularyLoader()
00292                 {
00293 
00294                 }
00295 
00296                 template <typename TermT>
00297                         void CVocabularyLoader<TermT>::load(CIndexInput* pIndexInput)
00298                 {
00299                         m_termCount = pIndexInput->readInt();
00300                         m_ptermtable = new term_table_type[m_termCount];
00301                         freq_t  df;
00302                         fileoffset_t    dfiP,ptiP;
00303                         for (int32_t i = 0;i < m_termCount;i++)
00304                         {
00305                                 m_ptermtable[i].tid = pIndexInput->readVInt();
00306                                 df = pIndexInput->readVInt();
00307                                 dfiP = pIndexInput->readVLong();                                
00308                                 ptiP = pIndexInput->readVLong();                                
00309                                 m_ptermtable[i].ti.set(df,dfiP,ptiP);
00310                         }
00311                 }
00312                 template <typename TermT>
00313                         CTermInfo* CVocabularyLoader<TermT>::find(TermT t)
00314                 {
00315                         int32_t start = 0,end = m_termCount-1;
00316                         int32_t mid = (start + end)/2;
00317                         while (start <= end)
00318                         {
00319                                 mid = (start + end)/2;
00320                                 if(m_ptermtable[mid].tid == t)
00321                                 {                                       
00322                                         return &(m_ptermtable[mid].ti);
00323                                 }
00324                                 if(m_ptermtable[mid].tid > t)
00325                                 {
00326                                         end = mid - 1;                                  
00327                                 }
00328                                 else
00329                                 {
00330                                         start = mid + 1;                                                                        
00331                                 }                               
00332                         }
00333                         return NULL;
00334                 }               
00335 
00336                 template <typename TermT>
00337                         void CVocabularyLoader<TermT>::clear()
00338                 {
00339                         if(m_ptermtable)
00340                         {
00341                                 delete[] m_ptermtable;
00342                                 m_ptermtable = NULL;
00343                         }                               
00344                         m_termCount = 0;
00345                 }
00346                 template <typename TermT>
00347                         typename CVocabularyLoader<TermT>::loader_iterator      CVocabularyLoader<TermT>::terms()
00348                 {                       
00349                         return typename CVocabularyLoader<TermT>::loader_iterator(m_ptermtable,m_termCount);;
00350                 }
00352                 //
00353                 template <typename TermT>
00354                         CHashMapBuilder<TermT>::CHashMapBuilder()
00355                         :m_memcache(NULL)
00356                 {
00357 
00358                 }
00359                 template <typename TermT>
00360                         CHashMapBuilder<TermT>::CHashMapBuilder(CPosMemCache* mc)
00361                         :m_memcache(mc)
00362                 {
00363 
00364                 }
00365                 template <typename TermT>
00366                         CHashMapBuilder<TermT>::~CHashMapBuilder()
00367                 {
00368                         clear();
00369                 }
00370                 
00371                 template <typename TermT>
00372                         void CHashMapBuilder<TermT>::save(CIndexOutputDescriptor* pOutputDesc)
00373                 {
00374                         CIndexOutput* tdiWriter = pOutputDesc->tdiWriter;
00375                         CIndexOutput* dfiWriter = pOutputDesc->dfiWriter;
00376                         CIndexOutput* ptiWriter = pOutputDesc->ptiWriter;
00377 
00378                         if(m_table.size() <= 0)
00379                                 return;                 
00380 
00381                         CPosting* pPost = NULL;
00382 
00383                         table_array* lex = new table_array[m_table.size()];
00384                         int32_t len = 0;                        
00385                         table_iterator iter = m_table.begin();                  
00386                         while (iter != m_table.end())
00387                         {                               
00388                                 pPost = iter->second;
00389                                 if(!pPost->hasNoMem())
00390                                 {
00391                                         lex[len].tid = iter->first;
00392                                         lex[len].posting = pPost;
00393                                         len ++;                                 
00394                                 }                               
00395                                 iter ++;
00396                         }                                       
00397 
00398                         if(len == 0)
00399                                 return ;
00400                         quickSort(lex,0,len-1);//快速排序               
00401 
00402                         tdiWriter->writeInt(len);//词典长度                             
00403                         for (int32_t i = 0;i < len;i++)
00404                         {
00405                                 tdiWriter->writeVInt(lex[i].tid);                                       //write termid
00406                                 pPost = lex[i].posting;
00407 
00408                                 tdiWriter->writeVInt(pPost->docFreq());                         //write df
00409                                 tdiWriter->writeVLong(dfiWriter->getFilePointer());     //write doc freq pos
00410                                 tdiWriter->writeVLong(ptiWriter->getFilePointer());     //write posting pos
00411 
00412                                 pPost->writeDocFreq(dfiWriter);                                         //write doc freq
00413                                 pPost->writePosition(ptiWriter);                                        //write posting
00414 
00415                                 pPost->reset();                                                                         //清空Posting
00416                         }
00417 
00418                         delete[] lex;
00419                 }
00420                 template <typename TermT>
00421                         inline CPosting* CHashMapBuilder<TermT>::find(term_type t)
00422                 {
00423                         table_iterator iter = m_table.find(t);
00424                         if(iter == m_table.end())
00425                         {
00426                                 CPosting* p = new CPosting(m_memcache);
00427                                 m_table.insert(table_item_pair(t,p));
00428                                 return p;
00429                         }
00430                         return iter->second;
00431                 }
00432                 template <typename TermT>
00433                         void CHashMapBuilder<TermT>::clear()
00434                 {
00435                         table_iterator iter = m_table.begin();
00436                         while (iter != m_table.end())
00437                         {
00438                                 delete iter->second;
00439                                 iter++;
00440                         }
00441                         m_table.clear();
00442                 }
00443                 template <typename TermT>
00444                         void CHashMapBuilder<TermT>::quickSort(table_array lex[], int32_t lo, int32_t hi)
00445                 {
00446                         if (lo >= hi)
00447                                 return;
00448 
00449                         int32_t mid = (lo + hi) / 2;
00450                         table_array tmp;
00451 
00452                         if (lex[lo].tid > lex[mid].tid) 
00453                         {
00454                                 tmp = lex[lo];
00455                                 lex[lo] = lex[mid];
00456                                 lex[mid] = tmp;
00457                         }
00458 
00459                         if (lex[mid].tid > lex[hi].tid)
00460                         {
00461                                 tmp = lex[mid];
00462                                 lex[mid] = lex[hi];
00463                                 lex[hi] = tmp;
00464 
00465                                 if (lex[lo].tid > lex[mid].tid)
00466                                 {
00467                                         tmp = lex[lo];
00468                                         lex[lo] = lex[mid];
00469                                         lex[mid] = tmp;
00470                                 }
00471                         }
00472 
00473                         int32_t left = lo + 1;
00474                         int32_t right = hi - 1;
00475 
00476                         if (left >= right)
00477                                 return;
00478 
00479                         term_type partition = lex[mid].tid;
00480 
00481                         for (; ;) 
00482                         {
00483                                 while (lex[right].tid > partition)
00484                                         --right;
00485 
00486                                 while ( (left < right) && (lex[left].tid <= partition))
00487                                         ++left;
00488 
00489                                 if (left < right) 
00490                                 {
00491                                         tmp = lex[left];
00492                                         lex[left] = lex[right];
00493                                         lex[right] = tmp;
00494                                         --right;
00495                                 }
00496                                 else 
00497                                 {
00498                                         break;
00499                                 }
00500                         }
00501 
00502                         quickSort(lex, lo, left);
00503                         quickSort(lex, left + 1, hi);
00504                 }
00505 
00507                 //
00509                 //CDynamicArrayBuilder
00510                 template <typename TermT>
00511                         CDynamicArrayBuilder<TermT>::CDynamicArrayBuilder()
00512                         :m_memcache(NULL)
00513                 {
00514                 }
00515                 template <typename TermT>
00516                         CDynamicArrayBuilder<TermT>::CDynamicArrayBuilder(CPosMemCache* mc)
00517                         :m_memcache(mc)
00518                 {
00519 
00520                 }
00521                 template <typename TermT>
00522                         CDynamicArrayBuilder<TermT>::~CDynamicArrayBuilder()
00523                 {
00524                         clear();
00525                 }
00526 
00527                 template <typename TermT>
00528                         void CDynamicArrayBuilder<TermT>::save(CIndexOutputDescriptor* pOutputDesc)
00529                 {
00530                         CIndexOutput* tdiWriter = pOutputDesc->tdiWriter;
00531                         CIndexOutput* dfiWriter = pOutputDesc->dfiWriter;
00532                         CIndexOutput* ptiWriter = pOutputDesc->ptiWriter;
00533 
00534                         int32_t len = 0;
00535                         CDynamicArray<CPosting*,Const_NullValue<CPosting*> >::array_iterator it = m_array.elements();                           
00536                         while(it.next())
00537                         {
00538                                 
00539                                 if(!(it.element()->hasNoMem()))
00540                                 {
00541                                         len++;
00542                                 }
00543                         }
00544                         if(len <= 0)
00545                                 return; 
00546                         tdiWriter->writeInt(len);//词典长度     
00547 
00548                         CPosting* pPosting;
00549                         CDynamicArray<CPosting*,Const_NullValue<CPosting*> >::array_iterator aiter = m_array.elements();                                
00550                         while(aiter.next())
00551                         {
00552                                 pPosting = aiter.element();
00553                                 if(!pPosting->hasNoMem())
00554                                 {
00555                                         tdiWriter->writeVInt((int32_t)aiter.position());                                //write termid
00556 
00557                                         tdiWriter->writeVInt(pPosting->docFreq());                      //write df
00558                                         tdiWriter->writeVLong(dfiWriter->getFilePointer());     //write doc freq pos
00559                                         tdiWriter->writeVLong(ptiWriter->getFilePointer());     //write posting pos
00560         
00561                                         pPosting->writeDocFreq(dfiWriter);                                      //write doc freq
00562                                         pPosting->writePosition(ptiWriter);                                     //write posting
00563 
00564                                         pPosting->reset();                                                                      //清空Posting
00565                                 }
00566                         }
00567                 }
00568                 template <typename TermT>
00569                         inline CPosting* CDynamicArrayBuilder<TermT>::find(term_type t)
00570                 {
00571                         CPosting* p = m_array[t];
00572                         if(p == NULL)
00573                         {
00574                                 p = new CPosting(m_memcache);
00575                                 m_array[t] = p;                         
00576                                 return p;
00577                         }                       
00578                         return p;
00579                 }
00580                 template <typename TermT>
00581                         void CDynamicArrayBuilder<TermT>::clear()
00582                 {
00583                         CPosting* pPosting;
00584                         CDynamicArray<CPosting*,Const_NullValue<CPosting*> >::array_iterator aiter = m_array.elements();                                
00585                         while(aiter.next())
00586                         {                               
00587                                 pPosting = aiter.element();
00588                                 if(pPosting)
00589                                         delete pPosting;                                
00590                         }
00591                         m_array.clear();
00592                 }
00593 
00595                 //
00596                 template<typename TermT,typename BuilderT,typename LoaderT>
00597                         CVocabulary<TermT,BuilderT,LoaderT>::CVocabulary(CPosMemCache* mc)
00598                         :m_builder(new BuilderT(mc))
00599                         ,m_loader(NULL)
00600                 {
00601                 }
00602                 template<typename TermT,typename BuilderT,typename LoaderT>
00603                         CVocabulary<TermT,BuilderT,LoaderT>::CVocabulary(void)
00604                         :m_builder(NULL)
00605                         ,m_loader(NULL)
00606                 {
00607                 }
00608                 template<typename TermT,typename BuilderT,typename LoaderT>
00609                         CVocabulary<TermT,BuilderT,LoaderT>::~CVocabulary(void)
00610                 {
00611                         if(m_builder)
00612                         {
00613                                 delete m_builder;
00614                                 m_builder = NULL;
00615                         }
00616                         if(m_loader)
00617                         {
00618                                 delete m_loader;
00619                                 m_loader = NULL;
00620                         }
00621                 }
00622 
00623                 template<typename TermT,typename BuilderT,typename LoaderT>
00624                         inline BuilderT* CVocabulary<TermT,BuilderT,LoaderT>::getBuilder()
00625                 {
00626                         return m_builder;
00627                 }
00628                 template<typename TermT,typename BuilderT,typename LoaderT>
00629                         inline LoaderT* CVocabulary<TermT,BuilderT,LoaderT>::getLoader()
00630                 {
00631                         return m_loader;
00632                 }
00633                 template<typename TermT,typename BuilderT,typename LoaderT>
00634                         void CVocabulary<TermT,BuilderT,LoaderT>::load(CIndexInput* pIndexInput)
00635                 {
00636                         if(m_loader == NULL)
00637                                 m_loader = new LoaderT();
00638                         m_loader->load(pIndexInput);
00639                 }
00640 
00641                 template<typename TermT,typename BuilderT,typename LoaderT>
00642                         void CVocabulary<TermT,BuilderT,LoaderT>::save(CIndexOutputDescriptor* pOutputDesc)
00643                 {
00644                         FIRTEX_ASSERT((m_builder!=NULL),_T("no vocabulary builder."));
00645                         m_builder->save(pOutputDesc);
00646                 }
00647 
00648                 template<typename TermT,typename BuilderT,typename LoaderT>
00649                         void CVocabulary<TermT,BuilderT,LoaderT>::clear()
00650                 {
00651                         if(m_builder)
00652                                 m_builder->clear();
00653                         if(m_loader)                    
00654                                 m_loader->clear();
00655                 }
00656 
00657                 template<typename TermT,typename BuilderT,typename LoaderT>
00658                         inline count_t CVocabulary<TermT,BuilderT,LoaderT>::distinctNumTerms()
00659                 {
00660                         if(m_loader)
00661                                 return m_loader->distinctNumTerms();
00662                         return 0;
00663                 }
00664         }
00665 }
00666 
00667 #endif

http://www.firtex.org http://www.sourceforge.net/projects/firtex