FirteX-高性能全文索引和检索平台

API Documentation


首页 | 名字空间列表 | 类继承关系 | 组合类型列表 | 目录 | 文件列表 | 名字空间成员 | 组合类型成员 | 文件成员

Posting.h

浏览该文件的文档。
00001 //
00002 // Copyright(C) 2005--2006 Institute of Computing Tech, Chinese Academy of Sciences. 
00003 // All rights reserved.
00004 // This file is part of FirteX (www.firtex.org)
00005 //
00006 // Use of the FirteX is subject to the terms of the software license set forth in 
00007 // the LICENSE file included with this software, and also available at
00008 // http://www.firtex.org/license.html
00009 //
00010 // Author       : 郭瑞杰(GuoRuijie)
00011 // Email        : ruijieguo@software.ict.ac.cn
00012 // Created      : 2005/11/20
00013 //
00014 #ifndef __POSTING_H
00015 #define __POSTING_H
00016 
00017 #include "../utility/StdHeader.h"
00018 #include "../store/IndexOutput.h"
00019 #include "../utility/MemCache.h"
00020 #include <vector>
00021 #include <fstream>
00022 
00023 
00024 using namespace std;
00025 using namespace firtex;
00026 using namespace firtex::store;
00027 
// Initial block size of a doc-freq posting list.
#define DOCFREQ_DEFAULT 5
// Initial block size of a position posting list.
#define POS_DEFAULT 8

// Cache grow size. The active value is 262144 (256 * 1024); the values
// 30000000 and 32768 (32K) were tried earlier and are no longer in effect.
#define CACHE_GROWSIZE 262144
00032 
00033 
00034 namespace firtex
00035 {
00036         namespace index
00037         {
#pragma pack(push,1)
/**
 * One block in the chained byte buffer of a posting list.
 *
 * CPostingList::add() writes at `end` and advances it; the free space of a
 * block is computed as `size - (end - begin)`.
 */
typedef struct _tagPOSTINGNODE
{
    uint8_t*            begin;  ///< first byte of this block's buffer
    uint8_t*            end;    ///< next byte to be written (one past the last byte used)
    size_t              size;   ///< capacity of the buffer, measured from `begin`
    _tagPOSTINGNODE*    next;   ///< next block in the chain; set by CPostingList::addNode()
}POSTINGNODE;
#pragma pack(pop)

/**
 * Singly linked chain of POSTINGNODE blocks that stores int32_t values in a
 * variable-length (7 bits per byte) encoding.
 *
 * The list does not allocate or free blocks itself: blocks are handed in
 * through addNode(), and reset() merely forgets them.
 */
class CPostingList
{
public:
    /**
     * Creates an empty list (no block attached).
     * @param nodesize stored as both the default and the next allocation size;
     *                 these fields are not used inside this class
     *                 (presumably consumed by the friend class CPosting -- confirm).
     */
    CPostingList(int32_t nodesize)
    {
        m_pPostHead = NULL;
        m_pPostTail = NULL;
        m_iNextAllocSize = nodesize;
        m_iDefaultAllocSize = nodesize;
    }
    ~CPostingList()
    {
    }
public:
    /**
     * Appends a value to the tail block using a variable-length encoding:
     * 7 payload bits per byte, low-order group first, with the high bit set
     * on every byte except the last.
     *
     * @param i value to append; it is encoded as its unsigned 32-bit pattern.
     * @return true if the value was written; false if the list has no block
     *         yet or the tail block lacks room. Nothing is written when false
     *         is returned, so the caller can attach a new block with addNode()
     *         and retry.
     */
    bool    add(int32_t i)
    {
        // A freshly constructed or reset() list has no tail block to write to.
        if(m_pPostTail == NULL)
            return false;

        uint32_t value = static_cast<uint32_t>(i);

        // Number of bytes this value occupies once encoded (1..5).
        size_t needed = 1;
        for(uint32_t rest = value >> 7; rest != 0; rest >>= 7)
            ++needed;

        const size_t used = m_pPostTail->end - m_pPostTail->begin;
        const size_t left = m_pPostTail->size - used;

        // Keep the original policy of requiring at least 4 free bytes, and
        // additionally refuse a 5-byte value when only 4 bytes remain; that
        // case used to write one byte past the end of the block.
        if(left < 4 || left < needed)
            return false;

        while((value & ~static_cast<uint32_t>(0x7F)) != 0)
        {
            *(m_pPostTail->end) = static_cast<uint8_t>((value & 0x7F) | 0x80);
            m_pPostTail->end++;
            value >>= 7;
        }
        *(m_pPostTail->end) = static_cast<uint8_t>(value);
        m_pPostTail->end++;
        return true;
    }

    /**
     * Attaches a block at the end of the chain and makes it the tail.
     * The new block's own `next` field is left untouched.
     * @param pNode block to attach.
     */
    void    addNode(POSTINGNODE* pNode)
    {
        if(m_pPostTail)
            m_pPostTail->next = pNode;
        m_pPostTail = pNode;
        if(!m_pPostHead)
            m_pPostHead = m_pPostTail;
    }

    /**
     * Detaches all blocks (without freeing them) and restores the next
     * allocation size to the default given at construction.
     */
    void    reset()
    {
        m_pPostHead = m_pPostTail = NULL;
        m_iNextAllocSize = m_iDefaultAllocSize;
    }
protected:
    POSTINGNODE*    m_pPostHead;            ///< first block of the chain, NULL when empty
    POSTINGNODE*    m_pPostTail;            ///< block currently being written, NULL when empty
    int32_t         m_iNextAllocSize;       ///< size to use for the next block allocation
    int32_t         m_iDefaultAllocSize;    ///< allocation size restored by reset()

    friend class CPosting;
};
00100 
00101 
00102                 class CPosting
00103                 {
00104                 public:
00105                         typedef CPosMemCache    memcache_type;
00106                 public:         
00107                         CPosting(memcache_type* mc);
00108 
00109                         ~CPosting();
00110                 public:
00116                         void    addLocation(docid_t docid, loc_t location);     
00117 
00122                         void    addDocument(docid_t docid);     
00123 
00124                         //判断有没有分配内存
00125                         bool    hasNoMem();
00126                         bool    allocMem(CPostingList* posting);
00127                         bool    allocMem();
00128 
00129                         count_t docFreq()const{return m_df;};
00130                         docid_t lastDocID(){return m_prevID;}
00131                         
00132                         int32_t         writeDocFreq(CIndexOutput* dfiOutput);
00133                         int32_t         writePosition(CIndexOutput* ptiOutput);
00134 
00135                         void    reset();
00136 
00137                 protected:      
00138                         CPostingList*                   m_pDocfreqPosting;
00139                         CPostingList*                   m_pPosPosting;
00140                         
00141                         
00142                         memcache_type*                  m_pMemCache;
00143 
00144                         count_t                                 m_df;                   //文档频率
00145 
00146                         loc_t                                   m_freq;                 //当前term freq
00147 
00148                         loc_t                                   m_prevID;       
00149                         loc_t                                   m_prevPos;
00150 
00151                         int32_t                                 m_iMemSize;
00152                         static size_t                   m_iNextGrowSize;        
00153 
00154                         int32_t                                 m_LOC_Tsize;
00155                 };
00156 
00158                 //inline functions
00159                 inline bool CPosting::hasNoMem()
00160                 {
00161                         if(m_iMemSize == 0)
00162                                 return true;
00163                         return false;
00164                 }
00165                 inline void CPosting::addLocation(docid_t docid, loc_t location) 
00166                 {                       
00167                         //判断是否旧文档
00168                         if (docid == m_prevID) 
00169                         {               
00170                                 if(!m_pPosPosting->add(location-m_prevPos))                             
00171                                 {
00172                                         //没内存了
00173                                         allocMem(m_pPosPosting);                
00174                                         m_pPosPosting->add(location-m_prevPos);//差量编码
00175                                 }
00176 
00177                                 m_freq++;
00178                                 m_prevPos = location;                           
00179                         } 
00180                         else//新文档
00181                         {
00182                                 if(m_freq > 0)//写上一文档的term freq
00183                                 {
00184                                         if(!m_pDocfreqPosting->add(m_freq))
00185                                         {
00186                                                 //没内存了
00187                                                 allocMem(m_pDocfreqPosting);            
00188                                                 m_pDocfreqPosting->add(m_freq);
00189                                         }
00190                                 }               
00191                                 else 
00192                                 {
00193                                         //第一个文档
00194                                         if(m_prevID == -1)
00195                                                 m_prevID = 0;
00196                                 }
00197                                 if(!m_pDocfreqPosting->add(docid - m_prevID))
00198                                 {
00199                                         //没内存了
00200                                         allocMem(m_pDocfreqPosting);            
00201                                         m_pDocfreqPosting->add(docid - m_prevID);
00202                                 }
00203                                 if(!m_pPosPosting->add(location))
00204                                 {
00205                                         //没内存了
00206                                         allocMem(m_pPosPosting);                
00207                                         m_pPosPosting->add(location);
00208                                 }
00209 
00210                                 m_freq = 1;
00211 
00212                                 m_prevID = docid;
00213 
00214                                 //保存这个位置,方便后面差量编码
00215                                 m_prevPos = location;
00216 
00217                                 m_df++;
00218                         }
00219                 }
00220 
00221                 inline void CPosting::addDocument(docid_t docid) 
00222                 {                       
00223                         //判断是否旧文档
00224                         if (docid == m_prevID) 
00225                         {
00226                                 m_freq++;                       
00227                         } 
00228                         else//新文档
00229                         {
00230                                 if(m_freq > 0)//写上一文档的term freq
00231                                 {
00232                                         if(!m_pDocfreqPosting->add(m_freq))
00233                                         {
00234                                                 //没内存了
00235                                                 allocMem(m_pDocfreqPosting);            
00236                                                 m_pDocfreqPosting->add(m_freq);
00237                                         }
00238                                 }               
00239                                 else 
00240                                 {
00241                                         //第一个文档
00242                                         if(m_prevID == -1)
00243                                                 m_prevID = 0;
00244                                 }
00245                                 if(!m_pDocfreqPosting->add(docid - m_prevID))
00246                                 {
00247                                         //没内存了
00248                                         allocMem(m_pDocfreqPosting);            
00249                                         m_pDocfreqPosting->add(docid - m_prevID);
00250                                 }
00251                                 
00252                                 m_freq = 1;
00253 
00254                                 m_prevID = docid;
00255                                 
00256                                 m_df++;
00257                         }
00258                 }
00259         }
00260 }
00261 
00262 
00263 
00264 #endif

http://www.firtex.org http://www.sourceforge.net/projects/firtex