// ========== This file is under LGPL, the GNU Lesser General Public Licence // ========== Dialing Lemmatizer (www.aot.ru), // ========== Copyright by Alexey Sokirko (2004) // The main function of this header is CMorphAutomatBuilder::AddStringDaciuk // This function is an implementation of algorithm for constructing minimal, // deterministic, acyclic FSM from unordered set of strings, which was described // in "Jan Daciuk, Stoyan Mihov, Bruce Watson, and Richard Watson, // Incremental Construction of Minimal Acyclic Finite State Automata, // Computational Linguistics, 26(1), March 2000." #ifndef MorphAutomBuilder_h #define MorphAutomBuilder_h #include "MorphAutomat.h" struct CTrieNodeBuild; struct IsLessRegister: public less { bool operator ()(const CTrieNodeBuild* pNodeNo1, const CTrieNodeBuild* pNodeNo2) const; }; typedef set CTrieRegister; struct CTrieNodeBuild { bool m_bFinal; int m_IncomingRelationsCount; CTrieNodeBuild* m_Children[MaxAlphabetSize]; CTrieRegister::iterator m_pRegister; bool m_bRegistered; int m_NodeId; BYTE m_FirstChildNo; BYTE m_SecondChildNo; void Initialize(); void AddChild(CTrieNodeBuild* Child, BYTE ChildNo); void ModifyChild(CTrieNodeBuild* Child, BYTE ChildNo, bool bUpdateIncoming); CTrieNodeBuild* GetNextNode(BYTE RelationChar) const; void SetNodeIdNullRecursive (); void UnregisterRecursive(); // debug function bool CheckIncomingRelationsCountRecursive(map& Node2Incoming) const; void GetIncomingRelationsCountRecursive(map& Node2Incoming) const; bool CheckRegisterRecursive() const; void SetFinal(bool bFinal); }; class CMorphAutomatBuilder : public CMorphAutomat { CTrieNodeBuild* m_pRoot; // a hash by the letter of the first child and the number of children CTrieRegister m_RegisterHash[MaxAlphabetSize+1][MaxAlphabetSize+1]; vector m_Prefix; vector m_DeletedNodes; void ClearBuildNodes(); CTrieNodeBuild* CreateNode(); void DeleteNode(CTrieNodeBuild* pNode); CTrieNodeBuild* CloneNode(const CTrieNodeBuild* pPrototype); void UpdateCommonPrefix(const string& WordForm); CTrieNodeBuild* AddSuffix(CTrieNodeBuild* pParentNodeNo, const char* WordForm); CTrieNodeBuild* ReplaceOrRegister(CTrieNodeBuild* pNode); void DeleteFromRegister(CTrieNodeBuild* pNode); int GetFirstConfluenceState() const; void UnregisterNode(CTrieNodeBuild* pNode); CTrieRegister& GetRegister(const CTrieNodeBuild* pNode); // debug functions bool CheckRegister() const; bool IsValid() const; public: CMorphAutomatBuilder(MorphLanguageEnum Language); ~CMorphAutomatBuilder(); void InitTrie(); bool AddStringDaciuk(const string& WordForm); void ClearRegister(); void ConvertBuildRelationsToRelations(); }; #endif