// ========== This file is under LGPL, the GNU Lesser General Public Licence // ========== Dialing Lemmatizer (www.aot.ru), // ========== Copyright by Alexey Sokirko (2004) #include "StdMorph.h" #include "MorphDict.h" //====================================================== //============= CMorphDict ============= //====================================================== CMorphDict::CMorphDict(MorphLanguageEnum Language) : m_SearchInfoLess(m_Bases) { m_pFormAutomat = 0; }; CMorphDict::~CMorphDict() { if (m_pFormAutomat != 0) delete m_pFormAutomat; m_pFormAutomat = 0; }; void CMorphDict::InitAutomat(CMorphAutomat* pFormAutomat) { assert (m_pFormAutomat == 0); assert (pFormAutomat != 0); m_pFormAutomat = pFormAutomat; }; void CMorphDict::GetLemmaInfos (const string& Text, size_t TextPos, vector& Infos) const { const size_t TextLength = Text.length(); size_t Count = Infos.size(); for (size_t i=0; i::const_iterator start = m_LemmaInfos.begin()+m_ModelsIndex[A.m_ModelNo]; vector::const_iterator end = m_LemmaInfos.begin()+m_ModelsIndex[A.m_ModelNo+1]; vector::const_iterator it = lower_bound (start, end, Base.c_str(), m_SearchInfoLess); assert (it != m_LemmaInfos.end()); { int LemmaStrNo = it->m_LemmaStrNo; assert (Base == m_Bases[LemmaStrNo].GetString()); } Infos[i].m_LemmaInfoNo = it-m_LemmaInfos.begin(); }; }; void CMorphDict::PredictBySuffix (const string& Text, size_t& TextPos, size_t MinimalPredictSuffixlen, vector& Infos) const { size_t Count = Text.length(); for (TextPos=1; TextPos+MinimalPredictSuffixlen<=Count; TextPos++) { m_pFormAutomat->GetInnerMorphInfos(Text, TextPos, Infos); if (!Infos.empty()) break; }; }; inline size_t get_size_in_bytes (const CLemmaInfoAndLemma& t) { return get_size_in_bytes(t.m_LemmaInfo) + get_size_in_bytes(t.m_LemmaStrNo); }; inline size_t save_to_bytes(const CLemmaInfoAndLemma& t, BYTE* buf) { buf += save_to_bytes(t.m_LemmaInfo, buf); buf += save_to_bytes(t.m_LemmaStrNo, buf); return get_size_in_bytes(t); }; inline size_t restore_from_bytes(CLemmaInfoAndLemma& t, const BYTE* buf) { buf += restore_from_bytes(t.m_LemmaInfo, buf); buf += restore_from_bytes(t.m_LemmaStrNo, buf); return get_size_in_bytes(t); }; // This procedure builds CMorphDict::m_ModelsIndex, which is an index to CMorphDict::m_LemmaInfos // * m_LemmaInfos is sorted by m_LemmaInfo.m_FlexiaModelNo // * CMorphDict::m_ModelsIndex stores all periods of CMorphDict::m_ModelsIndex with equal m_LemmaInfo.m_FlexiaModelNo // * if a=m_ModelsIndex[i] and b=m_ModelsIndex[i+1], then for each j (a<=jLoad(MakeFName(GrammarFileName,"forms_autom"))) return false; string PrecompiledFile = MakeFName(GrammarFileName,"annot"); FILE * fp = fopen(PrecompiledFile.c_str(), "rb"); if (!fp) { ErrorMessage (Format("Cannot open %s", PrecompiledFile.c_str())); return false; }; ReadFlexiaModels(fp, m_FlexiaModels); ReadAccentModels(fp, m_AccentModels); int Count; char buffer[256]; { if (!fgets(buffer, 256, fp)) return false; Count = atoi(buffer); } // add empty prefix m_Prefixes.resize(1,""); for (size_t i=0; i < Count; i++) { char buffer[256]; if (!fgets(buffer, 256, fp)) return false; string q = buffer; Trim(q); assert (!q.empty()); m_Prefixes.push_back(q); }; { if (!fgets(buffer, 256, fp)) return false; Count = atoi(buffer); } m_LemmaInfos.clear(); ReadVectorInner(fp, m_LemmaInfos, Count); { if (!fgets(buffer, 256, fp)) return false; Count = atoi(buffer); } m_NPSs.clear(); ReadVectorInner(fp, m_NPSs, Count); assert (m_NPSs.size() == m_FlexiaModels.size()); fclose(fp); m_Bases.ReadShortStringHolder(MakeFName(GrammarFileName,"bases")); CreateModelsIndex(); return true; }; bool CMorphDict::Save(string GrammarFileName) const { try { if (!m_pFormAutomat->Save(MakeFName(GrammarFileName,"forms_autom"))) { ErrorMessage (Format("Cannot write to %s", MakeFName(GrammarFileName,"forms_autom").c_str())); return false; } string PrecompiledFile = MakeFName(GrammarFileName,"annot"); FILE * fp = fopen(PrecompiledFile.c_str(), "wb"); if (!fp) { ErrorMessage (Format("Cannot write to %s", PrecompiledFile.c_str())); return false; }; WriteFlexiaModels(fp, m_FlexiaModels); WriteAccentModels(fp, m_AccentModels); assert (!m_Prefixes.empty() && m_Prefixes[0].empty()); // do not write the first empty prefix, instead add it manually each time during loading fprintf(fp, "%i\n", m_Prefixes.size()-1); for (size_t i=1; i < m_Prefixes.size(); i++) fprintf (fp, "%s\n",m_Prefixes[i].c_str()); fprintf(fp, "%i\n", m_LemmaInfos.size()); if (!WriteVectorInner(fp, m_LemmaInfos)) return false; assert (m_NPSs.size() == m_FlexiaModels.size()); fprintf(fp, "%i\n", m_NPSs.size()); if (!WriteVectorInner(fp, m_NPSs)) return false; fclose(fp); if (!m_Bases.WriteShortStringHolder(MakeFName(GrammarFileName,"bases"))) { fprintf(stderr, "Cannot save bases\n"); return false; }; return true; } catch (...) { fprintf (stderr, "Cannot save CMorphDict"); return false; }; };