FirteX - 高性能全文索引和检索平台 (high-performance full-text indexing and retrieval platform) - API Documentation
00001 // 00002 // Copyright(C) 2005--2006 Institute of Computing Tech, Chinese Academy of Sciences. 00003 // All rights reserved. 00004 // This file is part of FirteX (www.firtex.org) 00005 // 00006 // Use of the FirteX is subject to the terms of the software license set forth in 00007 // the LICENSE file included with this software, and also available at 00008 // http://www.firtex.org/license.html 00009 // 00010 // Author : 郭瑞杰(GuoRuijie) 00011 // Email : ruijieguo@software.ict.ac.cn 00012 // Created : 2005/11/20 00013 // 00014 #ifndef __POSTING_H 00015 #define __POSTING_H 00016 00017 #include "../utility/StdHeader.h" 00018 #include "../store/IndexOutput.h" 00019 #include "../utility/MemCache.h" 00020 #include <vector> 00021 #include <fstream> 00022 00023 00024 using namespace std; 00025 using namespace firtex; 00026 using namespace firtex::store; 00027 00028 #define DOCFREQ_DEFAULT 5 //doc freq posting 初始块大小 00029 #define POS_DEFAULT 8 //pos posting初始块大小 00030 00031 #define CACHE_GROWSIZE 262144//30000000//32768 //32K 00032 00033 00034 namespace firtex 00035 { 00036 namespace index 00037 { 00038 #pragma pack(push,1) 00039 typedef struct _tagPOSTINGNODE 00040 { 00041 uint8_t* begin; 00042 uint8_t* end; 00043 size_t size; 00044 _tagPOSTINGNODE* next; 00045 }POSTINGNODE; 00046 #pragma pack(pop) 00047 00048 class CPostingList 00049 { 00050 public: 00051 CPostingList(int32_t nodesize) 00052 { 00053 m_pPostHead = NULL; 00054 m_pPostTail = NULL; 00055 m_iNextAllocSize = nodesize; 00056 m_iDefaultAllocSize = nodesize; 00057 } 00058 ~CPostingList() 00059 { 00060 } 00061 public: 00062 bool add(int32_t i) 00063 { 00064 size_t left = m_pPostTail->size - (m_pPostTail->end - m_pPostTail->begin); 00065 if(left < 4)//最少要4个空闲 00066 return false; 00067 00068 //变长存储int32_t 00069 while ((i & ~ 0x7F) != 0) 00070 { 00071 *(m_pPostTail->end) = ((uint8_t) ((i & 0x7f) | 0x80)); 00072 m_pPostTail->end++; 00073 i = (int32_t) (((uint32_t) i) >> 7); 00074 } 00075 *(m_pPostTail->end) = 
((uint8_t)i); 00076 m_pPostTail->end++; 00077 return true; 00078 } 00079 void addNode(POSTINGNODE* pNode) 00080 { 00081 if(m_pPostTail) 00082 m_pPostTail->next = pNode; 00083 m_pPostTail = pNode; 00084 if(!m_pPostHead) 00085 m_pPostHead = m_pPostTail; 00086 } 00087 void reset() 00088 { 00089 m_pPostHead = m_pPostTail = NULL; 00090 m_iNextAllocSize = m_iDefaultAllocSize; 00091 } 00092 protected: 00093 POSTINGNODE* m_pPostHead; 00094 POSTINGNODE* m_pPostTail; 00095 int32_t m_iNextAllocSize; 00096 int32_t m_iDefaultAllocSize; 00097 00098 friend class CPosting; 00099 }; 00100 00101 00102 class CPosting 00103 { 00104 public: 00105 typedef CPosMemCache memcache_type; 00106 public: 00107 CPosting(memcache_type* mc); 00108 00109 ~CPosting(); 00110 public: 00116 void addLocation(docid_t docid, loc_t location); 00117 00122 void addDocument(docid_t docid); 00123 00124 //判断有没有分配内存 00125 bool hasNoMem(); 00126 bool allocMem(CPostingList* posting); 00127 bool allocMem(); 00128 00129 count_t docFreq()const{return m_df;}; 00130 docid_t lastDocID(){return m_prevID;} 00131 00132 int32_t writeDocFreq(CIndexOutput* dfiOutput); 00133 int32_t writePosition(CIndexOutput* ptiOutput); 00134 00135 void reset(); 00136 00137 protected: 00138 CPostingList* m_pDocfreqPosting; 00139 CPostingList* m_pPosPosting; 00140 00141 00142 memcache_type* m_pMemCache; 00143 00144 count_t m_df; //文档频率 00145 00146 loc_t m_freq; //当前term freq 00147 00148 loc_t m_prevID; 00149 loc_t m_prevPos; 00150 00151 int32_t m_iMemSize; 00152 static size_t m_iNextGrowSize; 00153 00154 int32_t m_LOC_Tsize; 00155 }; 00156 00158 //inline functions 00159 inline bool CPosting::hasNoMem() 00160 { 00161 if(m_iMemSize == 0) 00162 return true; 00163 return false; 00164 } 00165 inline void CPosting::addLocation(docid_t docid, loc_t location) 00166 { 00167 //判断是否旧文档 00168 if (docid == m_prevID) 00169 { 00170 if(!m_pPosPosting->add(location-m_prevPos)) 00171 { 00172 //没内存了 00173 allocMem(m_pPosPosting); 00174 
m_pPosPosting->add(location-m_prevPos);//差量编码 00175 } 00176 00177 m_freq++; 00178 m_prevPos = location; 00179 } 00180 else//新文档 00181 { 00182 if(m_freq > 0)//写上一文档的term freq 00183 { 00184 if(!m_pDocfreqPosting->add(m_freq)) 00185 { 00186 //没内存了 00187 allocMem(m_pDocfreqPosting); 00188 m_pDocfreqPosting->add(m_freq); 00189 } 00190 } 00191 else 00192 { 00193 //第一个文档 00194 if(m_prevID == -1) 00195 m_prevID = 0; 00196 } 00197 if(!m_pDocfreqPosting->add(docid - m_prevID)) 00198 { 00199 //没内存了 00200 allocMem(m_pDocfreqPosting); 00201 m_pDocfreqPosting->add(docid - m_prevID); 00202 } 00203 if(!m_pPosPosting->add(location)) 00204 { 00205 //没内存了 00206 allocMem(m_pPosPosting); 00207 m_pPosPosting->add(location); 00208 } 00209 00210 m_freq = 1; 00211 00212 m_prevID = docid; 00213 00214 //保存这个位置,方便后面差量编码 00215 m_prevPos = location; 00216 00217 m_df++; 00218 } 00219 } 00220 00221 inline void CPosting::addDocument(docid_t docid) 00222 { 00223 //判断是否旧文档 00224 if (docid == m_prevID) 00225 { 00226 m_freq++; 00227 } 00228 else//新文档 00229 { 00230 if(m_freq > 0)//写上一文档的term freq 00231 { 00232 if(!m_pDocfreqPosting->add(m_freq)) 00233 { 00234 //没内存了 00235 allocMem(m_pDocfreqPosting); 00236 m_pDocfreqPosting->add(m_freq); 00237 } 00238 } 00239 else 00240 { 00241 //第一个文档 00242 if(m_prevID == -1) 00243 m_prevID = 0; 00244 } 00245 if(!m_pDocfreqPosting->add(docid - m_prevID)) 00246 { 00247 //没内存了 00248 allocMem(m_pDocfreqPosting); 00249 m_pDocfreqPosting->add(docid - m_prevID); 00250 } 00251 00252 m_freq = 1; 00253 00254 m_prevID = docid; 00255 00256 m_df++; 00257 } 00258 } 00259 } 00260 } 00261 00262 00263 00264 #endif
http://www.firtex.org http://www.sourceforge.net/projects/firtex