FirteX-高性能全文索引和检索平台API Documentation |
00001 // 00002 // Copyright(C) 2005--2006 Institute of Computing Tech, Chinese Academy of Sciences. 00003 // All rights reserved. 00004 // This file is part of FirteX (www.firtex.org) 00005 // 00006 // Use of the FirteX is subject to the terms of the software license set forth in 00007 // the LICENSE file included with this software, and also available at 00008 // http://www.firtex.org/license.html 00009 // 00010 // Author : 郭瑞杰(GuoRuijie) 00011 // Email : ruijieguo@software.ict.ac.cn,ruijieguo@gmail.com 00012 // Created : 2005/11/20 00013 // 00014 00015 #ifndef _PARSER_H 00016 #define _PARSER_H 00017 00018 #if _MSC_VER > 1000 00019 #pragma once 00020 #endif // _MSC_VER > 1000 00021 00022 00023 #include "../utility/StdHeader.h" 00024 #include "../document/Document.h" 00025 #include "../document/Field.h" 00026 #include "../utility/FileReader.h" 00027 #include "../utility/StringReader.h" 00028 #include "../utility/Config.h" 00029 #include "../index/IndexParameter.h" 00030 #include "../document/DocumentSchema.h" 00031 #include <map> 00032 using namespace std; 00033 using namespace firtex::utility; 00034 using namespace firtex::index; 00035 using namespace firtex::document; 00036 00037 namespace firtex 00038 { 00039 namespace parser 00040 { 00044 class CParser 00045 { 00046 public: 00047 static const tstring category; 00048 static const tstring identifier; 00049 00050 CParser(void); 00051 virtual~CParser(void); 00052 public: 00056 virtual tstring getCategory(){ return category; } 00057 00061 virtual tstring getIdentifier(){ return m_identifier; } 00062 public: 00068 firtex::document::CDocument* parse(CIndexParameter* pArg); 00069 00073 virtual void close(); 00074 00080 virtual void setTermVector(const tchar* field,TermVector_ tv); 00081 00087 virtual TermVector_ getTermVector(const tchar* field); 00088 protected: 00094 virtual bool parseInternal(CIndexParameter* pArg) = 0; 00095 00100 virtual void defineSchema(CDocumentSchema* pSchema) = 0; 00101 00105 virtual tstring getFileType() = 0; 00106 protected: 00107 firtex::document::CDocument* getDocument(); 00108 protected: 00109 firtex::document::CDocument* m_pCachedDoc; 00110 tstring m_identifier; 00111 map<tstring,TermVector_> m_termVectors; 00112 }; 00113 00115 //inline functions 00116 inline firtex::document::CDocument* CParser::getDocument() 00117 { 00118 if(m_pCachedDoc == NULL) 00119 { 00120 m_pCachedDoc = new firtex::document::CDocument(); 00121 } 00122 return m_pCachedDoc; 00123 } 00124 inline firtex::document::CDocument* CParser::parse(CIndexParameter* pArg) 00125 { 00126 try 00127 { 00128 FIRTEX_LOG(GlobalConfig.Logger,level::dbg) << "Parsing..."<<endl; 00129 00130 firtex::document::CDocument* pDoc = getDocument(); 00131 if(pArg->isSchemaModified()) 00132 { 00133 CDocumentSchema schema; 00134 if(pArg->getSchema()) 00135 schema = *(pArg->getSchema()); 00136 else FIRTEX_CLOG(level::warn) << "Havn't define schema of metadata. " << endl; 00137 defineSchema(&schema); //由子类定义文档模式 00138 pDoc->setSchema(schema); //设置文档的模式 00139 pArg->setSchemaModified(false); //修改schema修改标志 00140 } 00141 if(parseInternal(pArg)) 00142 { 00143 pArg->startIterator(); 00144 while (pArg->hasNext()) 00145 { 00146 CMetadata* pMeta = pArg->next(); 00147 switch(pMeta->mt) 00148 { 00149 case MT_STR: 00150 pDoc->addField(pMeta->Id,(str_t)(*pMeta),true); 00151 break; 00152 case MT_WSTR: 00153 pDoc->addField(pMeta->Id,(wstr_t)(*pMeta),true); 00154 break; 00155 case MT_NUMBER: 00156 pDoc->addField(pMeta->Id,(numbervar)(*pMeta)); 00157 break; 00158 case MT_DATARECORD: 00159 pDoc->addField(pMeta->Id,(datarecord)(*pMeta),true); 00160 break; 00161 } 00162 } 00163 return pDoc; 00164 } 00165 return NULL; 00166 } 00167 catch (CFileIOException& e) 00168 { 00169 FIRTEX_LOG(GlobalConfig.Logger,level::err) << "Parse doc file error: " << ( (pArg->getParam().at ==param::File)?pArg->getParam().file.c_str():"") << endl << "error code:" << e.what() << endl; 00170 } 00171 catch(...) 00172 { 00173 FIRTEX_LOG(GlobalConfig.Logger,level::err) << "Parse doc file error: " << ( (pArg->getParam().at ==param::File)?pArg->getParam().file.c_str():"") << endl; 00174 } 00175 00176 return NULL; 00177 } 00178 } 00179 } 00180 00181 00182 #endif
http://www.firtex.org http://www.sourceforge.net/projects/firtex