/* * Copyright (c) 2006 Andrei V. Shethin * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 4. Neither the name of the Andrei V. Shethin Team nor the names of its * contributors may be used to endorse or promote products derived from * this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * LemClient.hpp * * $lem$ */ // C++ Includes #include #include "common/utilit.h" #include "AgramtabLib/RusGramTab.h" #include "AgramtabLib/EngGramTab.h" #include "AgramtabLib/GerGramTab.h" #include "LemmatizerLib/Lemmatizers.h" #include "LemmatizerLib/Paradigm.h" namespace lem { // // Constructor // Lemmatizer::Lemmatizer(const std::string sRML, int iILanguages): iLanguages(iILanguages) { pRusLem = NULL; pRusGramTab = NULL; pGerLem = NULL; pGerGramTab = NULL; pEngLem = NULL; pEngGramTab = NULL; try { if (iLanguages & C_RUS_GRAM_TAB) { std::string sError; pRusLem = new CLemmatizerRussian; pRusGramTab = new CRusGramTab; if (!pRusLem -> LoadDictionariesRegistry(sError)) { throw std::logic_error(sError.c_str()); } if (!pRusGramTab -> LoadFromRegistry()) { throw std::logic_error("Cannot load Russian gramtab."); } } if (iLanguages & C_GER_GRAM_TAB) { std::string sError; pGerLem = new CLemmatizerGerman; pGerGramTab = new CGerGramTab; if (!pGerLem -> LoadDictionariesRegistry(sError)) { throw std::logic_error(sError.c_str()); } if (!pGerGramTab -> LoadFromRegistry()) { throw std::logic_error("Cannot load German gramtab."); } } if (iLanguages & C_ENG_GRAM_TAB) { std::string sError; pEngLem = new CLemmatizerEnglish; pEngGramTab = new CEngGramTab; if (!pEngLem -> LoadDictionariesRegistry(sError)) { throw std::logic_error(sError.c_str()); } if (!pEngGramTab -> LoadFromRegistry()) { throw std::logic_error("Cannot load English gramtab."); } } } catch(CExpc &e) { throw std::logic_error(e.m_strCause.c_str()); } } // // A destructor // Lemmatizer::~Lemmatizer() throw() { if (pRusLem != NULL) { delete pRusLem; delete pRusGramTab; } if (pGerLem != NULL) { delete pGerLem; delete pGerGramTab; } if (pEngLem != NULL) { delete pEngLem; delete pEngGramTab; } } // // Get supported languages // int Lemmatizer::GetLanguages() { return iLanguages; } // // Lemmatize word and get ONLY array of first forms // std::vector Lemmatizer::LemmatizeQuick(std::string & sWord, bool bUseParadigms) { std::vector vParadigms; std::vector vResult; CLemmatizer * pLemmatizer; CAgramtab * pAgramtab; if (!sWord.length()) { return vResult; } if ((sWord[0] >= 'a' && sWord[0] <= 'z') || (sWord[0] >= 'A' && sWord[0] <= 'Z')) { pLemmatizer = pEngLem; pAgramtab = pEngGramTab; } else { pLemmatizer = pRusLem; pAgramtab = pRusGramTab; } pLemmatizer -> CreateParadigmCollection(false, sWord, true, vParadigms); for (int i = 0; i < vParadigms.size(); ++i) { string sGramCodes = vParadigms[i].GetSrcAncode(); LemmResultQuick oResult; oResult.vocabulary = vParadigms[i].m_bFound; oResult.first_form = vParadigms[i].GetWordForm(0); oResult.word_form = pAgramtab -> GetPartOfSpeech(sGramCodes.c_str()); vResult.push_back(oResult); } return vResult; } // // Lemmatized word with all word forms // std::vector Lemmatizer::FirstFormsQuick(std::string & sWord, bool bUseParadigms) { std::vector vResult; CLemmatizer * pLemmatizer; if (!sWord.length()) { return vResult; } if ((sWord[0] >= 'a' && sWord[0] <= 'z') || (sWord[0] >= 'A' && sWord[0] <= 'Z')) { pLemmatizer = pEngLem; } else { pLemmatizer = pRusLem; } pLemmatizer -> m_bUsePrediction = !bUseParadigms; pLemmatizer -> GetAllAncodesAndLemmasQuick(sWord, true, szBuffer, C_BUFFER_SIZE); char * szTMP = szBuffer; char * szBegin = szBuffer; for(;;) { while (*szTMP && *szTMP != ' ') { szTMP++; } if (!*szTMP) { vResult.push_back(szBegin); break; } *szTMP = '\0'; ++szTMP; vResult.push_back(szBegin); while (*szTMP && *szTMP != '#') { szTMP++; } szBegin = ++szTMP; if (!*szTMP) { break; } } return vResult; } } // namespace lem // End.