FirteX - 高性能全文索引和检索平台 (high-performance full-text indexing and retrieval platform) API Documentation
00001 // 00002 // Copyright(C) 2005--2006 Institute of Computing Tech, Chinese Academy of Sciences. 00003 // All rights reserved. 00004 // This file is part of FirteX (www.firtex.org) 00005 // 00006 // Use of the FirteX is subject to the terms of the software license set forth in 00007 // the LICENSE file included with this software, and also available at 00008 // http://www.firtex.org/license.html 00009 // 00010 // Author : 郭瑞杰(GuoRuijie) 00011 // Email : ruijieguo@software.ict.ac.cn,ruijieguo@gmail.com 00012 // Created : 2006/5/8 00013 // 00014 #ifndef _FIRTEXCOM_H 00015 #define _FIRTEXCOM_H 00016 00017 #include "../com/Com.h" 00018 00019 #ifdef __cplusplus 00020 00021 namespace firtex 00022 { 00023 namespace plugin 00024 { 00025 //ANSI字符串定义 00026 typedef struct _tagstr 00027 { 00028 char* str; 00029 size_t length; 00030 }STR; 00031 00032 //宽字符串定义 00033 typedef struct _tagwstr 00034 { 00035 wchar_t* str; 00036 size_t length; 00037 }WSTR; 00038 00039 typedef int32_t fieldid_t; 00040 00041 typedef uint8_t FIELDTYPE; 00042 00043 enum FIELDTYPEENUM 00044 { 00045 _FIELD_NONE = 0x00, //匿名 00046 _FIELD_WORD = 0x01, //普通的词,一般经过Analyzer形成WordID 00047 _FIELD_DATE = 0x02, //时间日期 00048 _FIELD_NAME = 0x03, //人名,字符串 00049 _FIELD_COMPANY = 0x04, //机构名,字符串 00050 _FIELD_EMAIL = 0x05, //Email,字符串 00051 _FIELD_NUM = 0x06, //数字 00052 _FIELD_ALPHA = 0x07, //字母,字符串 00053 _FIELD_ALPNUM = 0x08, //数字字母组合,字符串 00054 _FIELD_URL = 0x09, //URL,字符串 00055 }; 00056 00057 typedef unsigned short NUMBERTYPE; 00058 00059 //数字类型定义 00060 enum NUMENUM 00061 { 00062 NT_EMPTY = 0, 00063 NT_NULL = 1, 00064 NT_BOOL = 2, 00065 NT_I1 = 3, 00066 NT_UI1 = 4, 00067 NT_I2 = 5, 00068 NT_UI2 = 6, 00069 NT_I4 = 7, 00070 NT_UI4 = 8, 00071 NT_I8 = 9, 00072 NT_UI8 = 10, 00073 00074 NT_R4 = 11, 00075 NT_R8 = 12, 00076 }; 00077 00078 //数字定义 00079 typedef struct _tagNUMBERVAR 00080 { 00081 NUMBERTYPE nt; //Number Type 00082 union 00083 { 00084 bool bval; 00085 int8_t i8val; 00086 uint8_t ui8val; 00087 int16_t i16val; 
00088 uint16_t ui16val; 00089 int32_t i32val; 00090 uint32_t ui32val; 00091 int64_t i64val; 00092 uint64_t ui64val; 00093 00094 float fval; 00095 double dbval; 00096 }value; 00097 }NUMBERVAR; 00098 00099 //二进制数值定义 00100 typedef struct _tagDATARECORD 00101 { 00102 byte* data; //data 00103 size_t length; //length of data 00104 bool nocopy; //copy data or not 00105 }DATARECORD; 00106 00107 typedef unsigned short INDEXTYPE; 00108 00109 //索引数据类型定义 00110 enum INDEXENUM 00111 { 00112 IT_EMPTY = 0, //Empty 00113 IT_STR = 1, //ANSI 00114 IT_WSTR = 2, //UNICODE 00115 IT_NUMBER = 3, //Number 00116 IT_DATARECORD = 4, //Binary data 00117 }; 00118 00119 //索引数据定义 00120 typedef struct _tagINDEXDATAVAR 00121 { 00122 INDEXTYPE it; 00123 union 00124 { 00125 struct 00126 { 00127 union 00128 { 00129 STR strval; //ANSI 00130 WSTR wstrval; //Unicode 00131 }; 00132 bool nocopy; //copy text or not 00133 }text; 00134 NUMBERVAR numval; //Number 00135 DATARECORD drval; 00136 }data; 00137 }INDEXDATAVAR; 00138 00139 typedef INDEXTYPE METATYPE; 00140 typedef INDEXDATAVAR METADATAVAR; 00141 00142 typedef unsigned int method_type; 00143 typedef method_type Store_; 00144 typedef method_type Index_; 00145 typedef method_type TermVector_; 00146 00147 enum Index 00148 { 00149 INDEX_NO = 1, 00150 INDEX_ANALYZER = 2, 00151 INDEX_UN_ANALYZER = 3, 00152 }; 00153 enum Store 00154 { 00155 STORE_YES = 1, 00156 STORE_COMPRESS = 2, 00157 STORE_NO = 3 00158 }; 00159 enum TermVector 00160 { 00161 TERMVECTOR_NO = 1, 00162 TERMVECTOR_SEQUENCE = 2, //存储文档Analyze后的原始序列 00163 TERMVECTOR_WITHOUT_POSITION = 3, //不存储位置信息,仅存储词和词频信息 00164 TERMVECTOR_WITH_POSITION = 4, //存储词,词频,词位置信息 00165 }; 00166 00167 //{09B42270-8ACD-4e8a-B125-A79B46003F5D} 00168 class IDocumentSchema : public firtex::com::IUnknown 00169 { 00170 public: 00171 static const firtex::com::FX_IID iid; 00172 virtual firtex::com::FX_HRESULT FX_STDMETHODCALLTYPE addKeywordItem(/* [in] */firtex::com::BSTR name, 00173 /* [in] */firtex::plugin::FIELDTYPE ft, 
00174 /* [out] */firtex::plugin::fieldid_t* pid) = 0; 00175 virtual firtex::com::FX_HRESULT FX_STDMETHODCALLTYPE addUnIndexedItem(/* [in] */firtex::com::BSTR name, 00176 /* [out] */firtex::plugin::fieldid_t* pid) = 0; 00177 virtual firtex::com::FX_HRESULT FX_STDMETHODCALLTYPE addTextItem(/* [in] */firtex::com::BSTR name, 00178 /* [in] */firtex::plugin::FIELDTYPE ft, 00179 /* [in] */firtex::plugin::Store_ store, 00180 /* [in] */firtex::plugin::TermVector_ termVector, 00181 /* [out] */firtex::plugin::fieldid_t* pid) = 0; 00182 virtual firtex::com::FX_HRESULT FX_STDMETHODCALLTYPE addUnStoredItem(/* [in] */firtex::com::BSTR name, 00183 /* [in] */firtex::plugin::FIELDTYPE ft, 00184 /* [in] */firtex::plugin::TermVector_ termVector, 00185 /* [out] */firtex::plugin::fieldid_t* pid) = 0; 00186 virtual firtex::com::FX_HRESULT FX_STDMETHODCALLTYPE addBinaryItem(/* [in] */firtex::com::BSTR name, 00187 /* [in] */firtex::com::bool_t bCompress, 00188 /* [out] */firtex::plugin::fieldid_t* pid) = 0; 00189 }; 00190 00191 // {C92FA9F4-86FD-48c1-AB00-B16C78D2983F} 00192 class IDocument : public firtex::com::IUnknown 00193 { 00194 public: 00195 static const firtex::com::FX_IID iid; 00196 virtual firtex::com::FX_HRESULT FX_STDMETHODCALLTYPE setSchema(/* [in] */IDocumentSchema* pSchema) = 0; 00197 virtual firtex::com::FX_HRESULT FX_STDMETHODCALLTYPE addField(/* [in] */firtex::plugin::fieldid_t id, 00198 /* [in] */firtex::plugin::INDEXDATAVAR* value) = 0; 00199 }; 00200 00201 typedef unsigned short ARGTYPE; 00202 enum ARGENUM 00203 { 00204 ARG_FILE = 1, 00205 ARG_FILEW = 2, 00206 ARG_BUFFER = 3, 00207 }; 00208 00209 typedef struct _tagINDEXPARAMETER 00210 { 00211 ARGTYPE at; 00212 union 00213 { 00214 STR buffer; //Buffer 00215 STR file; //文件路径,ANSI 00216 WSTR wfile; //文件路径,UNICODE 00217 }param; 00218 }INDEXPARAMETER; 00219 00220 // {31F417A2-5111-4e49-8D14-EA1D63F6CD5D} 00221 class IIndexParameter : public firtex::com::IUnknown 00222 { 00223 public: 00224 static const firtex::com::FX_IID 
iid; 00225 00226 virtual firtex::com::FX_HRESULT FX_STDMETHODCALLTYPE getParam(/* [out][retval] */ firtex::plugin::INDEXPARAMETER* pParam) = 0; 00227 virtual firtex::com::FX_HRESULT FX_STDMETHODCALLTYPE addMetadata(/* [in] */firtex::plugin::fieldid_t id, 00228 /* [in] */firtex::plugin::METADATAVAR* pMeta) = 0; 00229 }; 00230 00231 // {750C8BE7-1858-4c61-A97C-C864CF552EFD} 00232 class IReader : public firtex::com::IUnknown 00233 { 00234 public: 00235 static const firtex::com::FX_IID iid; 00236 virtual firtex::com::FX_HRESULT FX_STDMETHODCALLTYPE read(/* [in] [retval] */byte* data,/* [in] */int32_t length) = 0; 00237 virtual firtex::com::FX_HRESULT FX_STDMETHODCALLTYPE readWithNoCopy(/* [out] [retval] */byte** data,/* [out] [retval] */int32_t* length) = 0; 00238 virtual firtex::com::FX_HRESULT FX_STDMETHODCALLTYPE close() = 0; 00239 virtual firtex::com::FX_HRESULT FX_STDMETHODCALLTYPE getFilePointer(/* [out] [retval] */int64_t* position) = 0; 00240 virtual firtex::com::FX_HRESULT FX_STDMETHODCALLTYPE seek(/* [in] */int64_t position) = 0; 00241 virtual firtex::com::FX_HRESULT FX_STDMETHODCALLTYPE isEof(/* [out] [retval] */firtex::com::bool_t* bEof) = 0; 00242 virtual firtex::com::FX_HRESULT FX_STDMETHODCALLTYPE length(/* [out] [retval] */int64_t* len) = 0; 00243 }; 00244 00245 enum TOKENENUM 00246 { 00247 TOKEN_NONE = 0x0, 00248 TOKEN_WORD = 0x01, //词 00249 TOKEN_DATE = 0x02, //时间日期 00250 TOKEN_NAME = 0x04, //人名 00251 TOKEN_COMPANY = 0x08, //机构名 00252 TOKEN_EMAIL = 0x10, //Email 00253 TOKEN_NUM = 0x20, //数字 00254 TOKEN_ALPHA = 0x40, //字母 00255 TOKEN_ALPNUM = 0x80, //数字字母组合 00256 TOKEN_URL = 0x100,//URL 00257 }; 00258 00259 00260 typedef uint32_t TOKENTYPE; 00261 typedef int32_t TERMID; 00262 // {E6F0F10C-DF0B-411a-9A3E-EA2BCD4DF256} 00263 class ITokens : public firtex::com::IUnknown 00264 { 00265 public: 00266 static const firtex::com::FX_IID iid; 00267 00268 virtual firtex::com::FX_HRESULT FX_STDMETHODCALLTYPE getType(/* [out] [retval] */TOKENTYPE* type) = 0; 00269 
virtual firtex::com::FX_HRESULT FX_STDMETHODCALLTYPE setType(/* [in] */TOKENTYPE type) = 0; 00270 virtual firtex::com::FX_HRESULT FX_STDMETHODCALLTYPE getCapacity(/* [out] [retval]*/int32_t* cap) = 0; 00271 virtual firtex::com::FX_HRESULT FX_STDMETHODCALLTYPE getTokenNum(/* [out] [retval]*/int32_t* numTokens) = 0; 00272 virtual firtex::com::FX_HRESULT FX_STDMETHODCALLTYPE getMaxTokens(/* [out] [retval]*/int32_t* maxTokens) = 0; 00273 virtual firtex::com::FX_HRESULT FX_STDMETHODCALLTYPE setMaxTokens(/* [in] */int32_t maxTokens) = 0; 00274 virtual firtex::com::FX_HRESULT FX_STDMETHODCALLTYPE clear() = 0; 00275 virtual firtex::com::FX_HRESULT FX_STDMETHODCALLTYPE getBuffer(/* [out] [retval] */byte** buf) = 0; 00276 virtual firtex::com::FX_HRESULT FX_STDMETHODCALLTYPE appendWord(/* [in] */TERMID tid,/* [out] [retval] */firtex::com::bool_t* bSuc) = 0; 00277 virtual firtex::com::FX_HRESULT FX_STDMETHODCALLTYPE appendDate(/* [in] */int64_t dt,/* [out] [retval] */firtex::com::bool_t* bSuc) = 0; 00278 }; 00279 00280 //{D6CC19E5-3FEA-4c96-B50C-C176080E89C5} 00281 class IParserPlugin : public firtex::com::IUnknown 00282 { 00283 public: 00284 static const firtex::com::FX_IID iid; 00285 00286 virtual firtex::com::FX_HRESULT FX_STDMETHODCALLTYPE getCategory(/* [out] [retval] */firtex::com::BSTR* retVal) = 0; 00287 virtual firtex::com::FX_HRESULT FX_STDMETHODCALLTYPE getIdentifier(/* [out] [retval] */firtex::com::BSTR* retVal) = 0; 00288 virtual firtex::com::FX_HRESULT FX_STDMETHODCALLTYPE getFileType(/* [out] [retval] */firtex::com::BSTR* retVal) = 0; 00289 virtual firtex::com::FX_HRESULT FX_STDMETHODCALLTYPE setTermVector(/* [in] */firtex::com::BSTR field,/* [in] */firtex::plugin::TermVector_ tv) = 0; 00290 virtual firtex::com::FX_HRESULT FX_STDMETHODCALLTYPE getTermVector(/* [in] */firtex::com::BSTR field,/* [out] [retval] */firtex::plugin::TermVector_* ptv) = 0; 00291 virtual firtex::com::FX_HRESULT FX_STDMETHODCALLTYPE defineSchema(/* [in] [out] [retval] */IDocumentSchema* 
pSchema) = 0; 00292 virtual firtex::com::FX_HRESULT FX_STDMETHODCALLTYPE parseInternal(/* [in] */IDocument* pDoc, 00293 /* [in] */IIndexParameter* param) = 0; 00294 }; 00295 00296 // {7D7D876B-CCE4-4fe0-8389-A20106A828A3} 00297 class IAnalyzerPlugin : public firtex::com::IUnknown 00298 { 00299 public: 00300 static const firtex::com::FX_IID iid; 00301 00302 virtual firtex::com::FX_HRESULT FX_STDMETHODCALLTYPE getCategory(/* [out] [retval] */firtex::com::BSTR* retVal) = 0; 00303 virtual firtex::com::FX_HRESULT FX_STDMETHODCALLTYPE getIdentifier(/* [out] [retval] */firtex::com::BSTR* retVal) = 0; 00304 virtual firtex::com::FX_HRESULT FX_STDMETHODCALLTYPE getTokenType(/* [out] [retval] */TOKENTYPE* retVal) = 0; 00305 virtual firtex::com::FX_HRESULT FX_STDMETHODCALLTYPE nextTokensInternal(/* [in] */IReader* reader,/* [in] [out] [retval]*/ITokens* tokens) = 0; 00306 }; 00307 } 00308 } 00309 #endif //__cplusplus 00310 00311 #endif
http://www.firtex.org http://www.sourceforge.net/projects/firtex