FirteX-高性能全文索引和检索平台

API Documentation


首页 | 名字空间列表 | 类继承关系 | 组合类型列表 | 目录 | 文件列表 | 名字空间成员 | 组合类型成员 | 文件成员

DateFieldIndexer.h

浏览该文件的文档。
00001 //
00002 // Copyright(C) 2005--2006 Institute of Computing Tech, Chinese Academy of Sciences. 
00003 // All rights reserved.
00004 // This file is part of FirteX (www.firtex.org)
00005 //
00006 // Use of the FirteX is subject to the terms of the software license set forth in 
00007 // the LICENSE file included with this software, and also available at
00008 // http://www.firtex.org/license.html
00009 //
00010 // Author       : 郭瑞杰(GuoRuijie)
00011 // Email        : ruijieguo@software.ict.ac.cn
00012 // Created      : 2006/6/10
00013 //
00014 
00015 #ifndef _DATEFIELDINDEXER_H
00016 #define _DATEFIELDINDEXER_H
00017 
00018 #include "../utility/StdHeader.h"
00019 #include "FieldIndexer.h"
00020 #include "Posting.h"
00021 #include "Vocabulary.h"
00022 
00023 using namespace std;
00024 using namespace firtex;
00025 
#ifndef WIN32
// Hash functor for 64-bit keys stored in hash_map (see m_table in
// CDateVocabularyBuilder).
//
// The specialization is written for `long long` rather than `int64_t`:
// where int64_t is a typedef for `long`, a specialization spelled
// hash<int64_t> would redefine the hash<long> that libstdc++'s hash header
// already provides and fail to compile. Where int64_t is `long long`, this
// is exactly the specialization the original code supplied.
// NOTE(review): assumes the toolchain's __gnu_cxx does not itself
// specialize hash<long long> - confirm for the compilers you target.
namespace __gnu_cxx
{
    template<>
    struct hash<long long>
    {
        /// Returns the key converted to size_t (identity hash).
        size_t operator()(long long key) const
        {
            return static_cast<size_t>(key);
        }
    };
}
#endif
00038 
00039 namespace firtex
00040 {
00041         namespace index
00042         {               
00043 
00044 
00045                 template <typename TermT>
00046                 class CDateVocabularyBuilder
00047                 {
00048                 public:
00049                         typedef TermT                                   term_type;
00050                 protected:
00051                         typedef struct _table_array
00052                         {
00053                                 term_type       tid;
00054                                 CPosting*       posting;
00055                         }table_array;
00056                 public:
00057                         CDateVocabularyBuilder();
00058                         CDateVocabularyBuilder(CPosMemCache* mc);
00059                         ~CDateVocabularyBuilder();
00060                 public:
00061                         void                    save(CIndexOutputDescriptor* pOutputDesc);
00062                         CPosting*               find(term_type t);                      
00063 
00064                         void                    clear();
00065                         count_t                 distinctNumTerms(){return (count_t)m_table.size();}
00066                 protected:                      
00067                         void quickSort(table_array lex[], int32_t lo, int32_t hi);
00068                 protected:                              
00069                         hash_map<term_type,CPosting*>   m_table;
00070                         CPosMemCache*                                   m_memcache;
00071 
00072                         typedef pair<term_type,CPosting*> table_item_pair;
00073                         typedef typename hash_map<term_type,CPosting*>::iterator table_iterator;                        
00074                 };
00075 
00076 
00078                 template <typename TermT>
00079                 class CDateVocabularyLoader
00080                 {
00081                 public:
00082                         typedef TermT                   term_type;
00083                 public:
00084                         class term_table
00085                         {
00086                         public:
00087                                 term_table(){}
00088                                 ~term_table(){}
00089                                 term_type               tid;
00090                                 CTermInfo               ti;
00091                         };
00092                         typedef term_table term_table_type;
00093                         typedef term_table* term_table_ptr;
00094 
00095                         class CDateVocabularyLoaderIterator
00096                         {
00097                         public:
00098                                 CDateVocabularyLoaderIterator(term_table_ptr tbl,count_t numTerms);
00099                                 CDateVocabularyLoaderIterator(const CDateVocabularyLoaderIterator& clone);
00100                                 ~CDateVocabularyLoaderIterator();
00101 
00102                                 bool            next();
00103                                 bool            skipTo(term_type t);
00104                                 term_type       term();
00105                                 CTermInfo*      second();
00106                         protected:
00107                                 count_t                 m_termCount;
00108                                 term_table_ptr  m_ptermtable;
00109                                 int32_t                 m_curPos;
00110                         };
00111                         typedef CDateVocabularyLoaderIterator loader_iterator;
00112                 public:
00113                         CDateVocabularyLoader();
00114                         ~CDateVocabularyLoader();
00115                 public:
00117                         void            load(CIndexInput* pIndexInput);
00124                         CTermInfo*      find(term_type t);
00125 
00129                         loader_iterator terms();
00130 
00132                         void            clear();                                
00133 
00135                         count_t         distinctNumTerms(){return m_termCount;}
00136                 protected:              
00137                         count_t                 m_termCount;
00138                         term_table_ptr  m_ptermtable;
00139                 };
00140 
00141                 typedef CVocabulary<int64_t,CDateVocabularyBuilder<int64_t>,CDateVocabularyLoader<int64_t> > CDateVocabulary;
00142 
00143                 class CDateFieldIndexer :       public CFieldIndexer
00144                 {
00145                 public:
00146                         CDateFieldIndexer(CPosMemCache* pMemCache);
00147                         virtual ~CDateFieldIndexer(void);
00148                 public:         
00154                         void                    addField(docid_t did,CField* pField);
00155 
00159                         void                    write(CIndexOutputDescriptor* pWriterDesc);
00160 
00164                         void                    setFilePointer(fileoffset_t off){m_tdiFilePointer = off;};
00165                         fileoffset_t    getFilePointer(){return m_tdiFilePointer;};
00166 
00170                         uint64_t                distinctNumTerms();
00171                 protected:                      
00172                         CDateVocabulary*                m_pDictionary;                  
00173 
00174                         CPosMemCache*                   m_pMemCache;                    //内存缓冲
00175                         fileoffset_t                    m_tdiFilePointer;            
00176                 };
00177 
00180                 //
00181                 template <typename TermT>
00182                         CDateVocabularyLoader<TermT>::CDateVocabularyLoaderIterator::CDateVocabularyLoaderIterator(term_table_ptr tbl,count_t numTerms)
00183                         :m_ptermtable(tbl)
00184                         ,m_termCount(numTerms)
00185                         ,m_curPos(-1)
00186                 {
00187                 }
00188 
00189                 template <typename TermT>
00190                         CDateVocabularyLoader<TermT>::CDateVocabularyLoaderIterator::CDateVocabularyLoaderIterator(const CDateVocabularyLoaderIterator& clone)
00191                         :m_ptermtable(clone.m_ptermtable)
00192                         ,m_termCount(clone.m_termCount)
00193                         ,m_curPos(clone.m_curPos)
00194                 {
00195                 }
00196                 template <typename TermT>
00197                         CDateVocabularyLoader<TermT>::CDateVocabularyLoaderIterator::~CDateVocabularyLoaderIterator()
00198                 {
00199                 }
00200 
00201                 template <typename TermT>
00202                         bool CDateVocabularyLoader<TermT>::CDateVocabularyLoaderIterator::next()
00203                 {
00204                         m_curPos++;
00205                         if(m_curPos < m_termCount)
00206                                 return true;
00207                         return false;
00208                 }
00209                 template <typename TermT>
00210                         bool CDateVocabularyLoader<TermT>::CDateVocabularyLoaderIterator::skipTo(TermT t)
00211                 {
00212                         int32_t k;
00213                         int32_t start = 0,end = m_termCount - 1;
00214                         int32_t nk = end;
00215                         while (start<=end)
00216                         {
00217                                 k = (start + end)/2;
00218                                 if(t == m_ptermtable[k].tid)//找到
00219                                 {
00220                                         nk = k;
00221                                         m_curPos = nk;
00222                                         return true;
00223                                 }
00224                                 if(t < m_ptermtable[k].tid)//查找左半边
00225                                 {
00226                                         end = k - 1;
00227                                         if(k >= start)
00228                                         {                                               
00229                                                 nk =k;
00230                                         }
00231                                 }
00232                                 else //查找右半边
00233                                 {
00234                                         start = k + 1;                                  
00235                                         if(start <= end)
00236                                         {
00237                                                 if(m_ptermtable[start].tid > t)
00238                                                 {                                                       
00239                                                         nk = start;
00240                                                 }
00241                                         }
00242                                 }
00243                         }
00244                         if(m_ptermtable[nk].tid < t)
00245                         {
00246                                 m_curPos = -1;
00247                                 return false;
00248                         }
00249                         m_curPos = nk;
00250                         return true;
00251                 }
00252                 template <typename TermT>
00253                         TermT CDateVocabularyLoader<TermT>::CDateVocabularyLoaderIterator::term()
00254                 {
00255                         return m_ptermtable[m_curPos].tid;
00256                 }
00257                 template <typename TermT>
00258                         CTermInfo* CDateVocabularyLoader<TermT>::CDateVocabularyLoaderIterator::second()
00259                 {
00260                         return &(m_ptermtable[m_curPos].ti);
00261                 }                               
00262 
00263                 template <typename TermT>
00264                         CDateVocabularyLoader<TermT>::CDateVocabularyLoader()
00265                 {
00266                 }
00267                 template <typename TermT>
00268                         CDateVocabularyLoader<TermT>::~CDateVocabularyLoader()
00269                 {
00270 
00271                 }
00272 
00273                 template <typename TermT>
00274                         void CDateVocabularyLoader<TermT>::load(CIndexInput* pIndexInput)
00275                 {
00276                         m_termCount = pIndexInput->readInt();
00277                         m_ptermtable = new term_table_type[m_termCount];
00278                         freq_t  df;
00279                         fileoffset_t    dfiP;
00280                         for (int32_t i = 0;i < m_termCount;i++)
00281                         {
00282                                 m_ptermtable[i].tid = pIndexInput->readVInt();
00283                                 df = pIndexInput->readVInt();
00284                                 dfiP = pIndexInput->readVLong();
00285                                 m_ptermtable[i].ti.set(df,dfiP,0);
00286                         }
00287                 }
00288                 template <typename TermT>
00289                         CTermInfo* CDateVocabularyLoader<TermT>::find(TermT t)
00290                 {
00291                         int32_t start = 0,end = m_termCount-1;
00292                         int32_t mid = (start + end)/2;
00293                         while (start <= end)
00294                         {
00295                                 mid = (start + end)/2;
00296                                 if(m_ptermtable[mid].tid == t)
00297                                 {                                       
00298                                         return &(m_ptermtable[mid].ti);
00299                                 }
00300                                 if(m_ptermtable[mid].tid > t)
00301                                 {
00302                                         end = mid - 1;                                  
00303                                 }
00304                                 else
00305                                 {
00306                                         start = mid + 1;                                                                        
00307                                 }                               
00308                         }
00309                         return NULL;
00310                 }               
00311 
00312                 template <typename TermT>
00313                         void CDateVocabularyLoader<TermT>::clear()
00314                 {
00315                         if(m_ptermtable)
00316                         {
00317                                 delete[] m_ptermtable;
00318                                 m_ptermtable = NULL;
00319                         }                               
00320                         m_termCount = 0;
00321                 }
00322                 template <typename TermT>
00323                         typename CDateVocabularyLoader<TermT>::loader_iterator  CDateVocabularyLoader<TermT>::terms()
00324                 {                       
00325                         return typename CDateVocabularyLoader<TermT>::loader_iterator(m_ptermtable,m_termCount);
00326                 }
00328                 //
00329                 template <typename TermT>
00330                         CDateVocabularyBuilder<TermT>::CDateVocabularyBuilder()
00331                         :m_memcache(NULL)
00332                 {
00333 
00334                 }
00335                 template <typename TermT>
00336                         CDateVocabularyBuilder<TermT>::CDateVocabularyBuilder(CPosMemCache* mc)
00337                         :m_memcache(mc)
00338                 {
00339                 }
00340                 template <typename TermT>
00341                         CDateVocabularyBuilder<TermT>::~CDateVocabularyBuilder()
00342                 {
00343 
00344                 }
00345 
00346                 template <typename TermT>
00347                         void CDateVocabularyBuilder<TermT>::save(CIndexOutputDescriptor* pOutputDesc)
00348                 {
00349                         CIndexOutput* tdiWriter = pOutputDesc->tdiWriter;
00350                         CIndexOutput* dfiWriter = pOutputDesc->dfiWriter;
00351 
00352                         if(m_table.size() <= 0)
00353                                 return;                 
00354 
00355                         CPosting* pPost = NULL;
00356 
00357                         table_array* lex = new table_array[m_table.size()];
00358                         int32_t len = 0;                        
00359                         table_iterator iter = m_table.begin();                  
00360                         while (iter != m_table.end())
00361                         {                               
00362                                 pPost = iter->second;
00363                                 if(!pPost->hasNoMem())
00364                                 {
00365                                         lex[len].tid = iter->first;
00366                                         lex[len].posting = pPost;
00367                                         len ++;                                 
00368                                 }                               
00369                                 iter ++;
00370                         }                                       
00371 
00372                         quickSort(lex,0,len-1);//快速排序               
00373 
00374                         tdiWriter->writeInt(len);//词典长度                             
00375                         for (int32_t i = 0;i < len;i++)
00376                         {
00377                                 tdiWriter->writeVLong(lex[i].tid);                                      //write termid
00378                                 pPost = lex[i].posting;
00379 
00380                                 tdiWriter->writeVInt(pPost->docFreq());                         //write df
00381                                 tdiWriter->writeVLong(dfiWriter->getFilePointer());     //write doc freq pos
00382 
00383                                 pPost->writeDocFreq(dfiWriter);                                         //write doc freq
00384 
00385                                 pPost->reset();                                                                         //清空Posting
00386                         }
00387 
00388                         delete[] lex;
00389                 }
00390                 template <typename TermT>
00391                         inline CPosting* CDateVocabularyBuilder<TermT>::find(term_type t)
00392                 {
00393                         table_iterator iter = m_table.find(t);
00394                         if(iter == m_table.end())
00395                         {
00396                                 CPosting* p = new CPosting(m_memcache);
00397                                 m_table.insert(table_item_pair(t,p));
00398                                 return p;
00399                         }
00400                         return iter->second;
00401                 }
00402                 template <typename TermT>
00403                         void CDateVocabularyBuilder<TermT>::clear()
00404                 {
00405                         table_iterator iter = m_table.begin();
00406                         while (iter != m_table.end())
00407                         {
00408                                 delete iter->second;
00409                                 iter++;
00410                         }
00411                         m_table.clear();
00412                 }
00413                 template <typename TermT>
00414                         void CDateVocabularyBuilder<TermT>::quickSort(table_array lex[], int32_t lo, int32_t hi)
00415                 {
00416                         if (lo >= hi)
00417                                 return;
00418 
00419                         int32_t mid = (lo + hi) / 2;
00420                         table_array tmp;
00421 
00422                         if (lex[lo].tid > lex[mid].tid) 
00423                         {
00424                                 tmp = lex[lo];
00425                                 lex[lo] = lex[mid];
00426                                 lex[mid] = tmp;
00427                         }
00428 
00429                         if (lex[mid].tid > lex[hi].tid)
00430                         {
00431                                 tmp = lex[mid];
00432                                 lex[mid] = lex[hi];
00433                                 lex[hi] = tmp;
00434 
00435                                 if (lex[lo].tid > lex[mid].tid)
00436                                 {
00437                                         tmp = lex[lo];
00438                                         lex[lo] = lex[mid];
00439                                         lex[mid] = tmp;
00440                                 }
00441                         }
00442 
00443                         int32_t left = lo + 1;
00444                         int32_t right = hi - 1;
00445 
00446                         if (left >= right)
00447                                 return;
00448 
00449                         term_type partition = lex[mid].tid;
00450 
00451                         for (; ;) 
00452                         {
00453                                 while (lex[right].tid > partition)
00454                                         --right;
00455 
00456                                 while ( (left < right) && (lex[left].tid <= partition))
00457                                         ++left;
00458 
00459                                 if (left < right) 
00460                                 {
00461                                         tmp = lex[left];
00462                                         lex[left] = lex[right];
00463                                         lex[right] = tmp;
00464                                         --right;
00465                                 }
00466                                 else 
00467                                 {
00468                                         break;
00469                                 }
00470                         }
00471 
00472                         quickSort(lex, lo, left);
00473                         quickSort(lex, left + 1, hi);
00474                 }
00475         }
00476 }
00477 
00478 
00479 #endif

http://www.firtex.org http://www.sourceforge.net/projects/firtex