// ---------------------------------------------------------------------------
// Overview of the definitions in this file (CAgramtab members and file-local
// helpers), limited to what the text below actually shows.
//
//  GetTagFromStr              Linear scan over A.GetPartOfSpeechesCount()
//                             entries; returns the index whose
//                             GetPartOfSpeechStr() equals tab_str (strcmp),
//                             otherwise UnknownPartOfSpeech.
//  CAgramtabLine ctor         Stores SourceLineNo in m_SourceLineNo.
//  GetGrammems                Zeroes the output, returns false for a null or
//                             empty gram_code or when GetLine(s2i(gram_code))
//                             is NULL; otherwise copies m_Grammems and returns
//                             true.
//  GrammemsToStr /            grammems_to_str walks grammem indices from
//  grammems_to_str            GetGrammemsCount()-1 down to 0 and, for each bit
//                             set in the mask, strcat()s GetGrammemStr(i) and
//                             "," to out_buf (so a non-empty result ends with
//                             a comma). GrammemsToStr passes a 64*5-byte stack
//                             buffer; no length check is performed here.
//  ProcessPOSAndGrammems      Rejects input longer than 300 characters, splits
//                             on " ,\t\r\n"; the first token is the part of
//                             speech ("*" maps to UnknownPartOfSpeech, any
//                             other unrecognised name fails). Every further
//                             token must equal some GetGrammemStr(i) (its bit
//                             is set via _QM(i)) or satisfy
//                             IsAdditionalGrammem(); otherwise returns false.
//  ProcessPOSAndGrammemsIfCan Pointer-argument wrapper for the above.
//  ProcessAgramtabLine        Skips the first two space-separated fields of
//                             tab_str and parses the rest into
//                             A.GetLine(LineNo); GetLine's result is
//                             dereferenced without a NULL check.
//  AreEqualPartOfSpeech       1 if both arguments are null, 0 if exactly one
//                             is null or either starts with '?'; otherwise
//                             compares m_PartOfSpeech of the two lines
//                             (GetLine results are not checked for NULL).
//  GetPartOfSpeech /          Return UnknownPartOfSpeech / 0 for a null code,
//  GetSourceLineNo            the code "??", or a code with no line.
//  GetAllGrammemsWhichContains Returns 255 for a null code or "??".
//  GetAllGrammems             Calls GetAllGrammemsWhichContains with
//                             GetMaxQWORD() as the mask.
//  LoadFromRegistry           Calls Read() and returns true unless an
//                             exception is thrown; Read()'s own return value
//                             is discarded.
//  LoadFromRegistryAndCheck   Returns ReadAndCheck()'s result, false on any
//                             exception.
//  Gleiche                    Returns false (0 as QWORD) when either argument
//                             is null or "??".
//  GleicheAncode1             Returns "" when either argument is null and
//                             gram_codes2 when either argument is "??".
//
// NOTE(review): this text appears to have lost its line breaks and, in several
// loops, the characters between a '<' and a later '>' (see e.g. the loop
// headers in Read, GetPartOfSpeechAndGrammems, FindGrammems,
// GetAllGrammemsWhichContains, GetFirstPartOfSpeech, Gleiche and
// GleicheAncode1). Those definitions are not documented above because their
// bodies cannot be read reliably - TODO confirm against version control and
// restore before building.
// ---------------------------------------------------------------------------
#include "StdGramtab.h" #include "../common/util_classes.h" // ========== This file is under LGPL, the GNU Lesser General Public Licence // ========== Dialing Lemmatizer (www.aot.ru) // ========== Copyright by Alexey Sokirko #include "agramtab_.h" //#include "assert.h" static BYTE GetTagFromStr(const CAgramtab& A, const char* tab_str) { for (BYTE i = 0; i < A.GetPartOfSpeechesCount(); i++) if (!strcmp(tab_str, A.GetPartOfSpeechStr(i))) return i; return UnknownPartOfSpeech; } CAgramtabLine :: CAgramtabLine (size_t SourceLineNo) { m_SourceLineNo = SourceLineNo; }; bool CAgramtab::GetGrammems(const char* gram_code, QWORD& grammems) const { grammems = 0; if (gram_code == 0) return false; if (!*gram_code) return false; const CAgramtabLine* L = GetLine(s2i(gram_code)); if (L == NULL) return false; grammems = L->m_Grammems; return true; }; string CAgramtab::GrammemsToStr(QWORD grammems) const { char szGrammems[64*5]; grammems_to_str(grammems, szGrammems); return szGrammems; } //#pragma optimize( "", off ) bool CAgramtab :: ProcessPOSAndGrammems (const char* line_in_gramtab, BYTE& PartOfSpeech, QWORD& grammems) const { if (strlen(line_in_gramtab) > 300) return false; StringTokenizer tok(line_in_gramtab," ,\t\r\n"); const char* strPos = tok(); if (!strPos) { //printf ("unknown pos"); return false; }; // getting the part of speech if( strcmp("*", strPos) ) { PartOfSpeech = GetTagFromStr(*this, strPos); if (PartOfSpeech == UnknownPartOfSpeech) return false; } else PartOfSpeech = UnknownPartOfSpeech; // getting grammems grammems = 0; while ( tok() ) { size_t Count = GetGrammemsCount(); const char* grm = tok.val(); size_t i = 0; for (; i < Count; i++) if (!strcmp(grm, GetGrammemStr(i)) ) { grammems |= _QM(i); break; }; if ( (i == Count) && !IsAdditionalGrammem(grm)) return false; }; return true; }; //#pragma optimize( "", on ) bool CAgramtab::ProcessPOSAndGrammemsIfCan (const char* tab_str, BYTE* PartOfSpeech, QWORD* grammems) const { return ProcessPOSAndGrammems(tab_str, 
*PartOfSpeech, *grammems); }; static bool ProcessAgramtabLine (CAgramtab& A, const char* tab_str, size_t LineNo) { const char* s = tab_str+strspn (tab_str, " "); s += strcspn (s, " "); s += strspn (s, " "); s += strcspn (s, " "); s += strspn (s, " ");; return A.ProcessPOSAndGrammems(s, A.GetLine(LineNo)->m_PartOfSpeech, A.GetLine(LineNo)->m_Grammems); }; CAgramtab :: CAgramtab () { m_bInited = false; }; bool CAgramtab :: Read (const char * FileName) { if (FileName == NULL) return true; for (size_t i=0; im_Grammems; QWORD g2 = GetLine(s2i(debug))->m_Grammems; if( (g1 == g2) && (GetLine(i)->m_PartOfSpeech == GetLine(s2i(debug))->m_PartOfSpeech) ) { printf ("a double found %s (%s)", debug, i2s(i).c_str()); return false; }; } } fclose (fp); return true; }; bool CAgramtab ::GetPartOfSpeechAndGrammems(const BYTE* AnCodes, DWORD& Poses, QWORD& Grammems) const { size_t len = strlen((const char*)AnCodes); if (len == 0) return false; // grammems Grammems = 0; Poses = 0; for (size_t l=0; lm_PartOfSpeech); Grammems |= L->m_Grammems; }; return true; } CAgramtab :: ~CAgramtab () { }; int CAgramtab :: AreEqualPartOfSpeech (char *grm1, char* grm2) { if ((grm1==0) && (grm2==0)) return 1; if ((grm1==0) && (grm2!=0)) return 0; if ((grm2==0) && (grm1!=0)) return 0; if (((unsigned char)grm1[0] == '?') || ((unsigned char)grm2[0] == '?')) return 0; return GetLine(s2i(grm1))->m_PartOfSpeech == GetLine(s2i(grm2))->m_PartOfSpeech; } char* CAgramtab :: grammems_to_str (QWORD grammems, char* out_buf) const { out_buf[0] = 0; size_t GrammemsCount = GetGrammemsCount(); for (int i = GrammemsCount-1; i >=0; i--) if (_QM(i) & grammems) { strcat (out_buf, GetGrammemStr(i)); strcat (out_buf, ","); }; return out_buf; }; bool CAgramtab :: FindGrammems (const char* gram_codes, QWORD grammems) const { for (size_t l=0; lm_Grammems & grammems) == grammems) return true; return false; }; bool CAgramtab::GetGramCodeByGrammemsAndPartofSpeechIfCan(BYTE Pos, QWORD grammems, string& gramcodes) const { for (WORD 
i=0; im_Grammems == grammems) && (GetLine(i)->m_PartOfSpeech == Pos) ) { gramcodes = i2s(i); return true; } } return false; }; BYTE CAgramtab::GetPartOfSpeech(const char* gram_code) const { if (gram_code == 0) return UnknownPartOfSpeech; if (!strcmp(gram_code, "??")) return UnknownPartOfSpeech; const CAgramtabLine* L = GetLine(s2i(gram_code)); if (L == NULL) return UnknownPartOfSpeech; return L->m_PartOfSpeech; } size_t CAgramtab::GetSourceLineNo(const char* gram_code) const { if (gram_code == 0) return 0; if (!strcmp(gram_code, "??")) return 0; const CAgramtabLine* L = GetLine(s2i(gram_code)); if (L == NULL) return 0; return L->m_SourceLineNo; } QWORD CAgramtab::GetAllGrammemsWhichContains(const char *gram_code, QWORD ToIntersect) const { if (gram_code == 0) return 255; if (!strcmp(gram_code, "??")) return 255; size_t len = strlen (gram_code); QWORD grammems = 0; for (size_t l=0; lm_Grammems; if (G & ToIntersect) grammems |= G; }; return grammems; }; QWORD CAgramtab::GetAllGrammems(const char *gramcode) const { return GetAllGrammemsWhichContains(gramcode, GetMaxQWORD()); } bool CAgramtab::LoadFromRegistry () { try { Read(::GetRegistryString(GetRegistryString()).c_str()); return true; } catch (...) { return false; }; }; bool CAgramtab::LoadFromRegistryAndCheck () { try { return ReadAndCheck(::GetRegistryString(GetRegistryString()).c_str()); } catch (...) 
{ return false; }; }; BYTE CAgramtab::GetFirstPartOfSpeech(const DWORD poses) const { BYTE Count = GetPartOfSpeechesCount(); for (BYTE i = 0; i < Count; i++) if (( poses & (1 <m_PartOfSpeech == pos) && ((grammems & L->m_Grammems) == grammems) ) Result += i2s(i); }; return Result; }; QWORD CAgramtab::Gleiche (GrammemCompare CompareFunc, const char* gram_codes1, const char* gram_codes2) const { QWORD grammems = 0; if (!gram_codes1) return false; if (!gram_codes2) return false; if (!strcmp(gram_codes1, "??")) return false; if (!strcmp(gram_codes2, "??")) return false; size_t len1 = strlen(gram_codes1); size_t len2 = strlen(gram_codes2); for (size_t l=0; lm_Grammems; QWORD G2 = GetLine(s2i(gram_codes2+m))->m_Grammems; if ( CompareFunc (G1, G2) ) grammems |= (G1 & G2); }; return grammems; }; // uses gleiche to compare ancodes from gram_codes1 with ancodes gram_codes2 // returns all ancodes from gram_codes1, which satisfy CompareFunc string CAgramtab::GleicheAncode1 (GrammemCompare CompareFunc, const char* gram_codes1, const char* gram_codes2) const { string Result; if (!gram_codes1) return ""; if (!gram_codes2) return ""; if (!strcmp(gram_codes1, "??")) return gram_codes2; if (!strcmp(gram_codes2, "??")) return gram_codes2; size_t len1 = strlen(gram_codes1); size_t len2 = strlen(gram_codes2); for (size_t l=0; lm_Grammems; for (size_t m=0; mm_Grammems; if ( CompareFunc (G1, G2) ) { //printf ("%s[%i]=%c\n",gram_codes1,l,gram_codes1[l]); Result.append(gram_codes1+l,2); //Result += gram_codes1[l+1]; break; }; }; }; return Result; }; string CommonAncodeAssignFunction(const CAgramtab* pGramTab, const string& s1, const string& s2) { string Result; size_t len1 = s1.length(); size_t len2 = s2.length(); for (size_t i=0; i