// ========== This file is under LGPL, the GNU Lesser General Public Licence // ========== Dialing Lemmatizer (www.aot.ru), // ========== Copyright by Alexey Sokirko (2004) #include "StdMorph.h" #include "MorphAutomat.h" #ifdef DETECT_MEMORY_LEAK #ifdef _DEBUG #define new DEBUG_NEW #undef THIS_FILE static char THIS_FILE[] = __FILE__; #endif #endif static int InitAlphabet(MorphLanguageEnum Language, int* pCode2Alphabet, int *pAlphabet2Code, bool bIncludeAnnotatorChar) { assert (!is_upper_alpha(AnnotChar, Language)); string AdditionalEnglishChars = "'1234567890"; string AdditionalGermanChars = ""; int AlphabetSize = 0; for (size_t i=0; i < 256; i++) { if ( is_upper_alpha((BYTE)i, Language) || (i == '-') || (bIncludeAnnotatorChar && (i == AnnotChar)) || ( (Language == morphEnglish) && (AdditionalEnglishChars.find(i) != string::npos) ) || ( (Language == morphGerman) && (AdditionalGermanChars.find(i) != string::npos) ) ) { pCode2Alphabet[AlphabetSize] = i; pAlphabet2Code[i] = AlphabetSize; AlphabetSize++; } else pAlphabet2Code[i] = -1; }; if (AlphabetSize > MaxAlphabetSize) { string Error = "Error! The ABC is too large"; ErrorMessage (Error); throw CExpc(Error); }; return AlphabetSize; }; string CABCEncoder::EncodeIntToAlphabet(DWORD v) const { string Result; if (v == 0) { Result.push_back(m_Code2AlphabetWithoutAnnotator[0]); const char* debug = Result.c_str(); return Result; } else while (v > 0) { Result.push_back(m_Code2AlphabetWithoutAnnotator[v%m_AlphabetSizeWithoutAnnotator]); v /= m_AlphabetSizeWithoutAnnotator; }; return Result; }; DWORD CABCEncoder::DecodeFromAlphabet(const string& v) const { size_t len = v.length(); int c = 1; int Result = 0; for (size_t i=0; i 0 ) DumpAllStringsRecursive(fp, 0, ""); fclose (fp); return true; }; int CMorphAutomat::NextNode(int NodeNo, BYTE RelationChar) const { if (NodeNo < ChildrenCacheSize) { int z = m_Alphabet2Code[RelationChar]; if (z == -1) return -1; return m_ChildrenCache[NodeNo*MaxAlphabetSize+z]; } else { const CMorphAutomRelation* start = m_pRelations +m_pNodes[NodeNo].GetChildrenStart(); const CMorphAutomRelation* end = start + GetChildrenCount(NodeNo); for (; start != end; start++) { const CMorphAutomRelation& p = *start; if (RelationChar == p.GetRelationalChar()) return p.GetChildNo(); }; return -1; }; }; void CMorphAutomat::GetAllMorphInterpsRecursive (int NodeNo, string& curr_path, vector& Infos) const { const CMorphAutomNode& N = m_pNodes[NodeNo]; if (N.IsFinal()) { CAutomAnnotationInner A; DWORD i = DecodeFromAlphabet(curr_path); size_t ItemNo; size_t ModelNo; size_t PrefixNo; DecodeMorphAutomatInfo(i, ModelNo, ItemNo, PrefixNo); A.m_ItemNo = ItemNo; A.m_ModelNo = ModelNo; A.m_PrefixNo = PrefixNo; Infos.push_back(A); }; size_t Count = GetChildrenCount(NodeNo); size_t CurrPathSize = curr_path.size(); curr_path.resize(CurrPathSize + 1); for (size_t i=0; i& Infos) const { Infos.clear(); int r = FindStringAndPassAnnotChar(Text, TextPos); if ( r == -1) return; // getting all interpretations string curr_path; GetAllMorphInterpsRecursive(r, curr_path, Infos); //assert (!Infos.empty()); //sort(Infos.begin(),Infos.end()); }; DWORD CMorphAutomat::EncodeMorphAutomatInfo (size_t ModelNo, size_t ItemNo, size_t PrefixNo) const { return (((DWORD)((WORD)((DWORD)(ModelNo) & 0xffff))) << 18) | (((DWORD)((WORD)((DWORD)(ItemNo) & 0xffff))) << 9) | PrefixNo; }; void CMorphAutomat::DecodeMorphAutomatInfo (DWORD Info, size_t& ModelNo, size_t& ItemNo, size_t& PrefixNo) const { ModelNo = Info >>18; ItemNo = (0x3FFFF&Info) >>9; PrefixNo = (0x1FF&Info); };