FirteX-高性能全文索引和检索平台

API Documentation


首页 | 名字空间列表 | 类继承关系 | 组合类型列表 | 目录 | 文件列表 | 名字空间成员 | 组合类型成员 | 文件成员

Parser.h

浏览该文件的文档。
00001 //
00002 // Copyright(C) 2005--2006 Institute of Computing Tech, Chinese Academy of Sciences. 
00003 // All rights reserved.
00004 // This file is part of FirteX (www.firtex.org)
00005 //
00006 // Use of the FirteX is subject to the terms of the software license set forth in 
00007 // the LICENSE file included with this software, and also available at
00008 // http://www.firtex.org/license.html
00009 //
00010 // Author       : 郭瑞杰(GuoRuijie)
00011 // Email        : ruijieguo@software.ict.ac.cn,ruijieguo@gmail.com
00012 // Created      : 2005/11/20
00013 //
00014 
00015 #ifndef _PARSER_H
00016 #define _PARSER_H
00017 
00018 #if _MSC_VER > 1000
00019 #pragma once
00020 #endif // _MSC_VER > 1000
00021 
00022 
00023 #include "../utility/StdHeader.h"
00024 #include "../document/Document.h"
00025 #include "../document/Field.h"
00026 #include "../utility/FileReader.h"
00027 #include "../utility/StringReader.h"
00028 #include "../utility/Config.h"
00029 #include "../index/IndexParameter.h"
00030 #include "../document/DocumentSchema.h"
00031 #include <map>
00032 using namespace std;
00033 using namespace firtex::utility;
00034 using namespace firtex::index;
00035 using namespace firtex::document;
00036 
00037 namespace firtex
00038 {
00039         namespace parser
00040         {
00044                 class CParser
                      // Abstract base class for document parsers. It cannot be instantiated
                      // directly because parseInternal(), defineSchema() and getFileType()
                      // are pure virtual. The non-virtual parse() entry point drives those
                      // hooks and fills a document object that is cached in m_pCachedDoc.
00045                 {
00046                 public:
                              // Class-wide category string; this is what getCategory() returns.
00047                         static const tstring category;
                              // Class-wide identifier string. Note that getIdentifier() returns the
                              // per-instance m_identifier member, not this static.
00048                         static const tstring identifier;
00049 
                              // Constructor/destructor are defined outside this header.
                              // NOTE(review): presumably the constructor sets m_pCachedDoc to NULL
                              // (getDocument() relies on that) and the destructor releases it - confirm.
00050                         CParser(void);
00051                         virtual~CParser(void);
00052                 public:
                              // Returns a copy of the static `category` string.
00056                         virtual tstring         getCategory(){ return category; }       
00057 
                              // Returns a copy of this instance's m_identifier.
00061                         virtual tstring         getIdentifier(){ return m_identifier; }
00062                 public:                 
                              // Parses the input described by pArg and returns the cached document,
                              // or NULL when parsing fails or an exception is caught.
                              // Defined inline after the class.
00068                         firtex::document::CDocument*    parse(CIndexParameter* pArg);
00069 
                              // Defined elsewhere. NOTE(review): presumably releases parser
                              // resources - confirm against the implementation.
00073                         virtual void            close();
00074 
                              // Defined elsewhere. NOTE(review): presumably stores the term-vector
                              // setting for `field` in m_termVectors - confirm.
00080                         virtual void            setTermVector(const tchar* field,TermVector_ tv);
00081 
                              // Defined elsewhere. NOTE(review): presumably looks up the term-vector
                              // setting for `field` in m_termVectors - confirm.
00087                         virtual TermVector_     getTermVector(const tchar* field);
00088                 protected:
                              // Subclass hook called by parse() to do the actual parsing. parse()
                              // only copies metadata into the document when this returns true.
00094                         virtual bool    parseInternal(CIndexParameter* pArg) = 0;
00095 
                              // Subclass hook called by parse() when pArg reports a modified schema;
                              // receives the schema object that parse() then installs on the document.
00100                         virtual void    defineSchema(CDocumentSchema* pSchema) = 0;
00101 
                              // Subclass hook; not called anywhere in this header.
                              // NOTE(review): presumably names the file type this parser handles - confirm.
00105                         virtual tstring         getFileType() = 0;
00106                 protected:
                              // Returns m_pCachedDoc, allocating it on first use (defined inline below).
00107                         firtex::document::CDocument*    getDocument();          
00108                 protected:
                              // Document object reused across parse() calls; allocated with `new`
                              // by getDocument(). Its release is not visible in this header.
00109                         firtex::document::CDocument*    m_pCachedDoc;   
                              // Per-instance identifier returned by getIdentifier().
00110                         tstring         m_identifier;
                              // Map from a tstring key to a TermVector_ value.
                              // NOTE(review): presumably keyed by field name - confirm.
00111                         map<tstring,TermVector_>                m_termVectors;
00112                 };
00113 
00115                 //inline functions
00116                 inline firtex::document::CDocument* CParser::getDocument()
                      // Returns the parser's cached document, allocating it with `new` the
                      // first time it is requested (i.e. while m_pCachedDoc is NULL). Every
                      // later call returns the same pointer until m_pCachedDoc is reset.
                      // NOTE(review): this relies on m_pCachedDoc being initialised to NULL
                      // by the constructor, and the matching delete is not visible in this
                      // header - confirm both in the implementation file.
00117                 {
00118                         if(m_pCachedDoc == NULL)
00119                         {
00120                                 m_pCachedDoc = new firtex::document::CDocument();                               
00121                         }
00122                         return m_pCachedDoc;
00123                 }                       
00124                 inline firtex::document::CDocument* CParser::parse(CIndexParameter* pArg)
                      // Parses the input described by pArg into the parser's cached document.
                      //
                      // Steps:
                      //   1. If pArg reports a modified schema, build a schema (starting from
                      //      pArg's schema when it has one), let the subclass extend it via
                      //      defineSchema(), install it on the document and clear the flag.
                      //   2. Call the subclass hook parseInternal().
                      //   3. On success, copy every metadata entry exposed by pArg's iterator
                      //      into the document as a field and return the document.
                      //
                      // Returns the cached document (the pointer from getDocument()) on
                      // success. Returns NULL when parseInternal() returns false or when an
                      // exception is caught; exceptions are logged and not rethrown.
                      //
                      // NOTE(review): pArg is dereferenced without a NULL check, both in the
                      // main path and inside the catch handlers.
                      // NOTE(review): the returned pointer is the parser's cached object, so
                      // ownership presumably stays with the parser - confirm.
00125                 {
00126                         try
00127                         {
00128                                 FIRTEX_LOG(GlobalConfig.Logger,level::dbg) << "Parsing..."<<endl;
00129                                 
00130                                 firtex::document::CDocument* pDoc = getDocument();
00131                                 if(pArg->isSchemaModified())
00132                                 {
00133                                         CDocumentSchema schema;
                                              // Start from the caller-supplied schema when present;
                                              // otherwise warn and continue with a default-constructed one.
00134                                         if(pArg->getSchema())
00135                                                 schema = *(pArg->getSchema());
00136                                         else FIRTEX_CLOG(level::warn) << "Havn't define schema of metadata. " << endl;
00137                                         defineSchema(&schema);                          // the subclass defines the document schema
00138                                         pDoc->setSchema(schema);                        // install the schema on the document
                                              // The flag is cleared before parseInternal() runs, so it
                                              // stays cleared even if parsing fails afterwards.
00139                                         pArg->setSchemaModified(false);         // reset the schema-modified flag
00140                                 }
00141                                 if(parseInternal(pArg))
00142                                 {                                       
                                              // Copy each metadata entry from pArg into the document,
                                              // converting it according to its type tag (mt).
00143                                         pArg->startIterator();
00144                                         while (pArg->hasNext())
00145                                         {
                                                      // NOTE(review): pMeta is used without a NULL check.
00146                                                 CMetadata* pMeta = pArg->next();
                                                      // There is no default case: entries whose mt matches
                                                      // none of the cases below are skipped silently.
00147                                                 switch(pMeta->mt)
00148                                                 {
00149                                                 case MT_STR:
00150                                                         pDoc->addField(pMeta->Id,(str_t)(*pMeta),true);
00151                                                         break;
00152                                                 case MT_WSTR:
00153                                                         pDoc->addField(pMeta->Id,(wstr_t)(*pMeta),true);
00154                                                         break;
00155                                                 case MT_NUMBER:
                                                              // Unlike the other cases, no third argument is passed here.
00156                                                         pDoc->addField(pMeta->Id,(numbervar)(*pMeta));
00157                                                         break;
00158                                                 case MT_DATARECORD:
00159                                                         pDoc->addField(pMeta->Id,(datarecord)(*pMeta),true);
00160                                                         break;
00161                                                 }                                               
00162                                         }
00163                                         return pDoc;
00164                                 }
                                      // parseInternal() reported failure.
00165                                 return NULL;
00166                         }
                              // File I/O failure: log the file name (only when the parameter
                              // is file-based) together with e.what(), then fall through to
                              // the final `return NULL`.
00167                         catch (CFileIOException& e) 
00168                         {
00169                                 FIRTEX_LOG(GlobalConfig.Logger,level::err) << "Parse doc file error: " << ( (pArg->getParam().at ==param::File)?pArg->getParam().file.c_str():"") << endl << "error code:" << e.what() << endl;
00170                         }               
                              // Any other exception: log the same message without an error
                              // code and swallow it.
00171                         catch(...)
00172                         {
00173                                 FIRTEX_LOG(GlobalConfig.Logger,level::err) << "Parse doc file error: " << ( (pArg->getParam().at ==param::File)?pArg->getParam().file.c_str():"") << endl;
00174                         }
00175 
                              // Reached only after an exception was caught and logged.
00176                         return NULL;
00177                 }
00178         }
00179 }
00180 
00181 
00182 #endif

http://www.firtex.org http://www.sourceforge.net/projects/firtex