FirteX-高性能全文索引和检索平台

API Documentation


首页 | 名字空间列表 | 类继承关系 | 组合类型列表 | 目录 | 文件列表 | 名字空间成员 | 组合类型成员 | 文件成员

ChineseAnalyzer.h

浏览该文件的文档。
00001 //
00002 // Copyright(C) 2005--2006 Institute of Computing Tech, Chinese Academy of Sciences. 
00003 // All rights reserved.
00004 // This file is part of FirteX (www.firtex.org)
00005 //
00006 // Use of the FirteX is subject to the terms of the software license set forth in 
00007 // the LICENSE file included with this software, and also available at
00008 // http://www.firtex.org/license.html
00009 //
00010 // Author       : Kevin Zhang,郭瑞杰(GuoRuijie)
00011 // Email        : ruijieguo@software.ict.ac.cn,ruijieguo@gmail.com
00012 // Created      : 2005/11/20
00013 //
00014 
00015 #ifndef _CHINESEANALYZER_H
00016 #define _CHINESEANALYZER_H
00017 
00018 #if _MSC_VER > 1000
00019 #pragma once
00020 #endif // _MSC_VER > 1000
00021 
00022 #include "../utility/StdHeader.h"
00023 #include "Analyzer.h"
00024 
00025 #define _CHARSET_SIZE 65536 // Element count of the int tables m_charsetfreq, m_freq and m_charset in CChineseAnalyzer (65536 == 2^16). NOTE(review): a leading underscore followed by an uppercase letter is a reserved identifier form in C++.
00026 
00027 namespace firtex
00028 {
00029         namespace analyzer
00030         {
00031                 class CChineseAnalyzer : public CAnalyzer // CAnalyzer subclass that keeps an array of double-array-trie states plus per-character tables. Presumably a dictionary-based Chinese word analyzer -- confirm against the implementation file.
00032                 {
00033                 public:
00034                         static const string category; // Declared only; the value is defined elsewhere. NOTE(review): unqualified 'string' presumably comes from a using-declaration in an included header.
00035                         static const string identifier; // Declared only; the value is defined elsewhere.
00036                 private:
00037                         struct state
00038                         {// One state record of the double-array trie.
00039                                 int base;// Base value of this state.
00040                                 int check;// Check value of this state.
00041                                 int handle;// Handle for the dictionary entry associated with this state.
00042                         };
00043                         typedef struct state STATE,*PSTATE; // STATE names the record type, PSTATE a pointer to it.
00044                 public:
00045                         CChineseAnalyzer(const tchar *sDicName,CParser* pParser = NULL); // sDicName: dictionary name (presumably the file handed to Load() -- confirm); pParser is optional and defaults to NULL.
00046                         CChineseAnalyzer(CParser* pParser = NULL); // Overload without a dictionary name; pParser is optional and defaults to NULL.
00047                         virtual ~CChineseAnalyzer(void); // Virtual destructor; cleanup is implemented elsewhere.
00048                 public:
00053                         TokenType       getTokenType(){return TOKEN_WORD;}; // Always returns TOKEN_WORD.
00054 
00058                         void            close(); // Implemented elsewhere; presumably releases the analyzer's resources -- TODO confirm.
00059                 protected:
00066                         CTokens*        nextTokensInternal(CReader* reader,CTokens* pInput); // Takes a reader and an input token container and returns a CTokens pointer; ownership and exact semantics are not visible here.
00067                 public:
00069                         wordid_t        getWordId(const char *cWord); // Returns the wordid_t for the C string cWord; the value used for unknown words is not visible here.
00070                 protected:
00071                         bool            Load(const tchar *sFilename); // Takes a file name and returns bool; presumably true when the dictionary loaded successfully -- TODO confirm.
00072                 protected:
00073                         PSTATE  m_pData; // Pointer to STATE records; presumably the array that m_nLength and m_nLowerBound describe -- confirm.
00074                         int             m_nLength;// Array length.
00075                         int             m_nLowerBound;// Lower bound in the array.
00076                         int             m_nItemCount;// Item count.
00077                 private:
00078                         int             m_charsetSize; // Presumably the number of character codes actually in use -- TODO confirm.
00079                         int             m_charsetfreq[_CHARSET_SIZE],m_freq[_CHARSET_SIZE]; // Two int tables of _CHARSET_SIZE entries each, stored inline in the object; judging by the names they hold frequency data -- confirm.
00080                         int             m_LowerChar,m_UpperChar; // Presumably the lowest and highest character codes encountered -- TODO confirm.
00081                         int             m_charset[_CHARSET_SIZE]; // Third inline int table of _CHARSET_SIZE entries. NOTE(review): the three inline tables make every instance large; be careful about placing instances on the stack.
00082                 };
00083 
00084         }
00085 }
00086 
00087 #endif

http://www.firtex.org http://www.sourceforge.net/projects/firtex