FirteX-高性能全文索引和检索平台API Documentation |
00001 // 00002 // Copyright(C) 2005--2006 Institute of Computing Tech, Chinese Academy of Sciences. 00003 // All rights reserved. 00004 // This file is part of FirteX (www.firtex.org) 00005 // 00006 // Use of the FirteX is subject to the terms of the software license set forth in 00007 // the LICENSE file included with this software, and also available at 00008 // http://www.firtex.org/license.html 00009 // 00010 // Author : Kevin Zhang,郭瑞杰(GuoRuijie) 00011 // Email : ruijieguo@software.ict.ac.cn,ruijieguo@gmail.com 00012 // Created : 2005/11/20 00013 // 00014 00015 #ifndef _CHINESEANALYZER_H 00016 #define _CHINESEANALYZER_H 00017 00018 #if _MSC_VER > 1000 00019 #pragma once 00020 #endif // _MSC_VER > 1000 00021 00022 #include "../utility/StdHeader.h" 00023 #include "Analyzer.h" 00024 00025 #define _CHARSET_SIZE 65536 00026 00027 namespace firtex 00028 { 00029 namespace analyzer 00030 { 00031 class CChineseAnalyzer : public CAnalyzer 00032 { 00033 public: 00034 static const string category; 00035 static const string identifier; 00036 private: 00037 struct state 00038 {//state information in double-array trie 00039 int base;//base value 00040 int check;//check value 00041 int handle;//handle for dictionary entry 00042 }; 00043 typedef struct state STATE,*PSTATE; 00044 public: 00045 CChineseAnalyzer(const tchar *sDicName,CParser* pParser = NULL); 00046 CChineseAnalyzer(CParser* pParser = NULL); 00047 virtual ~CChineseAnalyzer(void); 00048 public: 00053 TokenType getTokenType(){return TOKEN_WORD;}; 00054 00058 void close(); 00059 protected: 00066 CTokens* nextTokensInternal(CReader* reader,CTokens* pInput); 00067 public: 00069 wordid_t getWordId(const char *cWord); 00070 protected: 00071 bool Load(const tchar *sFilename); 00072 protected: 00073 PSTATE m_pData; 00074 int m_nLength;//Array length 00075 int m_nLowerBound;//Lower bound in the array 00076 int m_nItemCount;//Item Count 00077 private: 00078 int m_charsetSize; 00079 int m_charsetfreq[_CHARSET_SIZE],m_freq[_CHARSET_SIZE]; 00080 int m_LowerChar,m_UpperChar; 00081 int m_charset[_CHARSET_SIZE]; 00082 }; 00083 00084 } 00085 } 00086 00087 #endif
http://www.firtex.org http://www.sourceforge.net/projects/firtex