// NOTE(review): the two physical lines that follow hold the first part of this
// translation unit with the original line breaks removed. Every "//" comment
// therefore runs to the end of its physical line and hides the code after it;
// the line structure has to be restored before this can compile.
//
// Several spans that originally sat between a '<' and a later '>' also appear
// to be missing. The visible symptoms are "for (size_t i=LB; i= HB) return HB;",
// the repeated "for ( ; (i= HB) return HB;", "(iLB)" and "(i0)". As a result
// the body of SetState and the token-scanning helpers that followed it are
// truncated: only the header of PSoft and the tails of two backward-scanning
// loops (one over IsSpace(), one over IsSoft()) survive. Restore that region
// from a pristine copy of the file; do not reconstruct it by hand.
//
// Definitions that are still fully readable below:
//   CUnitHolder()                 sets m_Language to morphRussian.
//   IsQuestionOrExclamationMarks  token is non-empty and its first byte is '?' or '!'.
//   IsSentenceEndMark             true for IsOneFullStop(i), IsQuestionOrExclamationMarks(i),
//                                 a one-byte token equal to cEllipseChar, a token of
//                                 length 2..5 starting with '.', or IsTextAreaEnd().
//   IsOneCloseQuotationMark,
//   IsOneOpenQuotationMark        return false for i == 0; otherwise true for a one-byte
//                                 token equal to '"' or to the closing / opening
//                                 guillemet. Both read GetToken()[0] before the
//                                 length test is evaluated.
//   SetDes                        forwards to m_Units[x].SetDes(des).
//   IsHyphen                      HasDescr(x,OHyp) and token length 1.
//   is_latin_alpha, is_lowercase,
//   is_uppercase                  use the German tables when m_Language == morphGerman,
//                                 otherwise the English (and, for case, Russian) ones.
//   StrSuperCompare               true when s has exactly the token's length and
//                                 strscmp() over that length returns 0.
//   IsOneAlpha                    one-byte token that has ORLE or OLLE, or that
//                                 satisfies is_latin_alpha().
//   IsOneChar                     one-byte token whose byte equals i.
//   FirstUpper                    HasDescr(x,OUp) or HasDescr(x,OUpLw).
//   IsBulletWord                  HasDescr(x,ODC) or IsOneAlpha(x).
//   IsOneFullStop                 one-byte token equal to '.'.
// ========== This file is under LGPL, the GNU Lesser General Public Licence // ========== Dialing Graphematical Module (www.aot.ru) // ========== Copyright by Alexey Sokirko (1996-2001) #include "StdGraph.h" #include "UnitHolder.h" //======================================================= //========== CUnitHolder ============= //======================================================= CUnitHolder::CUnitHolder() { m_Language = morphRussian; }; void CUnitHolder::SetState (size_t LB, size_t HB, WORD state) { for (size_t i=LB; i= HB) return HB; for ( ; (i= HB) return HB; for ( ; (i= HB) return HB; for ( ; (i= HB) return HB; for ( ; (iLB) && m_Units[i].IsSpace(); i--); return i; } size_t CUnitHolder::PSoft (size_t i, size_t HB) const { if (i >= HB) return HB; for ( ;(i0) && m_Units[i].IsSoft(); i--); return i; } bool CUnitHolder::IsQuestionOrExclamationMarks (size_t i) const { return ( (m_Units[i].GetTokenLength() > 0) && ( ((unsigned char)m_Units[i].GetToken()[0] == '?') || ((unsigned char)m_Units[i].GetToken()[0] == '!') ) ); }; bool CUnitHolder::IsSentenceEndMark (size_t i) const { return IsOneFullStop(i) || IsQuestionOrExclamationMarks(i) || ((m_Units[i].GetTokenLength() == 1) && ((BYTE)m_Units[i].GetToken()[0] == cEllipseChar)) || ( (m_Units[i].GetTokenLength() >1) && (m_Units[i].GetTokenLength() <6) && ((unsigned char)m_Units[i].GetToken()[0] == '.') ) || m_Units[i].IsTextAreaEnd(); }; bool CUnitHolder::IsOneCloseQuotationMark (size_t i) const { if (i == 0) return false; BYTE z = (BYTE)m_Units[i].GetToken()[0]; return (m_Units[i].GetTokenLength() == 1) && ( ( z == (BYTE)'"') || ( z == (BYTE)'»') ); }; bool CUnitHolder::IsOneOpenQuotationMark (size_t i) const { if (i == 0) return false; BYTE z = (BYTE)m_Units[i].GetToken()[0]; return (m_Units[i].GetTokenLength() == 1) && ( (z == (BYTE)'"') || (z == (BYTE)'«') ); }; void CUnitHolder::SetDes(size_t x, Descriptors des) { m_Units[x].SetDes(des); }; bool CUnitHolder::IsHyphen(size_t x) const { return HasDescr(x,OHyp) && 
m_Units[x].GetTokenLength() == 1; }; bool CUnitHolder::is_latin_alpha (int ch) const { if (m_Language == morphGerman) return is_german_alpha(ch); else return is_english_alpha(ch); }; bool CUnitHolder::is_lowercase (int ch) const { if (m_Language == morphGerman) return is_german_lower(ch); else return is_russian_lower(ch) || is_english_lower(ch); }; bool CUnitHolder::is_uppercase (int ch) const { if (m_Language == morphGerman) return is_german_upper(ch); else return is_russian_upper(ch) || is_english_upper(ch); }; bool CUnitHolder::StrSuperCompare (int UnitNo, const char* s) const { return (s[m_Units[UnitNo].GetTokenLength()] == 0) && !strscmp(m_Units[UnitNo].GetToken(), s, m_Units[UnitNo].GetTokenLength(), m_Language); }; bool CUnitHolder::IsOneAlpha(size_t x) const { return ((HasDescr(x,ORLE) || HasDescr(x,OLLE)) && (m_Units[x].GetTokenLength()==1)) || ( (m_Units[x].GetTokenLength()==1) && is_latin_alpha((unsigned char)m_Units[x].GetToken()[0]) ); }; bool CUnitHolder::IsOneChar(size_t x, int i) const { return m_Units[x].GetTokenLength() == 1 && m_Units[x].GetToken()[0] == i; }; bool CUnitHolder::FirstUpper(size_t x) const { return HasDescr(x,OUp) || HasDescr(x,OUpLw); }; bool CUnitHolder::IsBulletWord (size_t x) const { return HasDescr(x,ODC) || IsOneAlpha(x); }; bool CUnitHolder::IsOneFullStop (size_t i) const { return (m_Units[i].GetTokenLength() == 1) && (m_Units[i].GetToken()[0] == '.'); }; // i is the index of the grapheme at the very beginning of a line. // Step back to the start of that line and check whether the line before it // is an empty line. 
// NOTE(review): the two physical lines that follow hold the middle part of this
// translation unit with the original line breaks removed. Every "//" comment
// therefore runs to the end of its physical line and hides the code after it,
// and the second physical line starts with the tail of a comment
// ("with the spaces"). The line structure has to be restored before this can
// compile.
//
// Several spans that originally sat between a '<' and a later '>' also appear
// to be missing. Visible symptoms:
//   - "for (int i = 0; i& CUnitHolder::GetUnitBuf()": the body of
//     BuildUnitBufferUpper and the return type of GetUnitBuf are gone;
//   - "const vector&": the element type of the returned vectors is gone;
//   - "LineNoLineNo-20": most of the loop nest in ClearPairDescriptors is gone
//     (what remains scans backwards over at most 20 units for dual_descr,
//     deletes it, asserts it was found and deletes des from LineNo);
//   - "LineNo(m_Units[StartLineNo].GetToken())": the head of the merge loop in
//     MakeOneWord, including the declarations of SpacesLength and out, is gone;
//   - "const_cast(": the target type of each const_cast is gone;
//   - "map::const_iterator": the map's template arguments are gone.
// Restore these regions from a pristine copy of the file; do not reconstruct
// them by hand.
//
// Definitions that are still readable below:
//   EmptyLineBeforeGraph  returns false when i == 0 or unit i IsSoft(). It then
//                         takes k = BSpace(i-1,0) and returns false when k == 0
//                         or unit k is not IsEOLN(). It returns true when that
//                         unit's token is longer than 2 bytes, or is 2 bytes
//                         long and starts with '\n'. Otherwise it steps one
//                         unit back (false if that reaches 0), applies BSpace
//                         again and returns IsEOLN() of the unit found.
//                         The HB parameter is not used.
//   FreeTable             ClearVector() on m_Units and m_TokenBuf, clear() on
//                         m_FoundOborots and m_FoundPageBreaks.
//   GetUnitBuf,
//   GetInputBuffer        return m_TokenBuf / m_InputBuffer by const reference.
//   AddUnit               appends NewLine to m_Units.
//   MakeOneWord           returns at once when StartLineNo+1 == EndLineNo. In
//                         the surviving tail, when SpacesLength > 0, unit
//                         StartLineNo+1 is pointed just past the token of unit
//                         StartLineNo and turned into SpacesLength spaces. Its
//                         uppercase copy is refreshed and its oborot entry
//                         removed. (EndLineNo-StartLineNo-2) bytes are erased
//                         from m_UnitBufUpper when more than two units are
//                         involved, and units [StartLineNo+2, EndLineNo) are
//                         erased. Otherwise units [StartLineNo+1, EndLineNo)
//                         and (EndLineNo-StartLineNo-1) bytes of m_UnitBufUpper
//                         are erased. A final assert compares the stored
//                         uppercase token of unit StartLineNo+2 with a freshly
//                         uppercased copy of its token.
//   SetOborotNo           OborotNo == -1 erases the entry keyed by the unit's
//                         GetInputOffset(); any other value stores it there.
//   GetOborotNo           returns -1 for LineNo == 0 or when no entry exists,
//                         otherwise the stored value.
bool CUnitHolder::EmptyLineBeforeGraph (size_t i, size_t HB) const { if ((i == 0) || m_Units[i].IsSoft()) return false; size_t k = BSpace (i-1,0); if ((k == 0) || (!m_Units[k].IsEOLN())) return false; if ( (m_Units[k].GetTokenLength()>2) || ( (m_Units[k].GetTokenLength() == 2) && (m_Units[k].GetToken()[0] == '\n') ) ) return true; k--; if ( k == 0 ) return false; k = BSpace ( k,0 ); return m_Units[k].IsEOLN(); }; void CUnitHolder::FreeTable() { ClearVector(m_Units); ClearVector(m_TokenBuf); m_FoundOborots.clear(); m_FoundPageBreaks.clear(); } void CUnitHolder :: BuildUnitBufferUpper () { m_UnitBufUpper.clear(); for (int i = 0; i& CUnitHolder::GetUnitBuf() const { return m_TokenBuf; }; const vector& CUnitHolder::GetInputBuffer() const { return m_InputBuffer; }; void CUnitHolder::AddUnit(const CGraLine& NewLine) { m_Units.push_back(NewLine); }; void CUnitHolder::ClearPairDescriptors(size_t StartLineNo, size_t EndLineNo) { for (size_t LineNo=StartLineNo; LineNoLineNo-20; i--) if (HasDescr(i, dual_descr)) { DeleteDescr(i,dual_descr); break; }; // we should find the dual descriptor in this range assert (i > LineNo-20); DeleteDescr(LineNo,(Descriptors)des); }; }; void CUnitHolder::MakeOneWord(size_t StartLineNo, size_t EndLineNo) { if (StartLineNo+1 == EndLineNo) return; // delete all pair descriptors which intersects with [StartLineNo,EndLineNo) ClearPairDescriptors(StartLineNo, EndLineNo); // move "end of sentence" descriptors from all lines of [StartLineNo,EndLineNo) to StartLineNo for (size_t LineNo=StartLineNo+1; LineNo(m_Units[StartLineNo].GetToken()) + m_Units[StartLineNo].GetTokenLength(); strncpy(out, m_Units[LineNo].GetToken(), m_Units[LineNo].GetTokenLength()); GetUnit(StartLineNo).AddLength(m_Units[LineNo]); char* upper_out= const_cast(GetUppercaseToken(StartLineNo)); strcat(upper_out, GetUppercaseToken(LineNo)); } else SpacesLength += m_Units[LineNo].GetTokenLength(); }; // if some spaces were found in (StartLineNo, EndLineNo) // then fill line StartLineNo+1 
with the spaces if (SpacesLength > 0) { m_Units[StartLineNo+1].SetToken(m_Units[StartLineNo].GetToken()+m_Units[StartLineNo].GetTokenLength()); m_Units[StartLineNo+1].MakeSpaces(SpacesLength); { // update upper buffer char* upper_out= const_cast(GetUppercaseToken(StartLineNo+1)); strncpy(upper_out, m_Units[StartLineNo+1].GetToken(), m_Units[StartLineNo+1].GetTokenLength()); upper_out[m_Units[StartLineNo+1].GetTokenLength()] = 0; }; SetOborotNo(StartLineNo+1,-1); if (EndLineNo-StartLineNo > 2) { // erasing NULLs from the upper m_UnitBufUpper, which were added after each line size_t upper_rest_offset = GetUppercaseToken(StartLineNo+2) - GetUnitUpperBufferStart(); m_UnitBufUpper.erase(m_UnitBufUpper.begin()+upper_rest_offset,m_UnitBufUpper.begin()+upper_rest_offset+(EndLineNo-StartLineNo-2)); }; // deleting (StartLineNo+2, EndLineNo) m_Units.erase(m_Units.begin()+StartLineNo+2, m_Units.begin() + EndLineNo); } else { // deleting (StartLineNo+1, EndLineNo) m_Units.erase(m_Units.begin()+StartLineNo+1, m_Units.begin() + EndLineNo); // erasing NULLs from the upper m_UnitBufUpper, which were added after each line size_t upper_rest_offset = GetUppercaseToken(StartLineNo+1) - GetUnitUpperBufferStart(); m_UnitBufUpper.erase(m_UnitBufUpper.begin()+upper_rest_offset,m_UnitBufUpper.begin()+upper_rest_offset+(EndLineNo-StartLineNo-1)); } // check uppercase buffer if (StartLineNo+2 < m_Units.size()) { const char* check = GetUppercaseToken(StartLineNo+2); string s = GetToken(StartLineNo+2); RmlMakeUpper(s, m_Language); assert(check == s); }; }; void CUnitHolder::SetOborotNo(size_t LineNo, short OborotNo) { if (OborotNo == -1) m_FoundOborots.erase(m_Units[LineNo].GetInputOffset()); else m_FoundOborots[m_Units[LineNo].GetInputOffset()] = OborotNo; } short CUnitHolder::GetOborotNo(size_t LineNo) const { if (LineNo == 0) return -1; map::const_iterator it = m_FoundOborots.find(m_Units[LineNo].GetInputOffset()); if ( it == m_FoundOborots.end() ) return -1; else return it->second; } 
// Stores or clears the page number attached to unit LineNo.
//
// Page numbers are kept in m_FoundPageBreaks, keyed by the unit's input offset
// (m_Units[LineNo].GetInputOffset()), not by the unit index itself.
//
//   LineNo      index into m_Units; it is not range-checked here.
//   PageNumber  value to store; UnknownPageNumber removes any stored entry
//               instead of storing the sentinel.
void CUnitHolder::SetPageNumber(size_t LineNo, DWORD PageNumber)
{
    if (PageNumber == UnknownPageNumber)
        m_FoundPageBreaks.erase(m_Units[LineNo].GetInputOffset());
    else
        m_FoundPageBreaks[m_Units[LineNo].GetInputOffset()] = PageNumber;
}

// Returns the page number stored for unit LineNo, or UnknownPageNumber when
// m_FoundPageBreaks has no entry for that unit's input offset.
//
//   LineNo  index into m_Units; it is not range-checked here.
DWORD CUnitHolder::GetPageNumber(size_t LineNo) const
{
    // The iterator type is deduced: the map's template arguments are declared
    // together with m_FoundPageBreaks and are not spelled out in this file.
    const auto it = m_FoundPageBreaks.find(m_Units[LineNo].GetInputOffset());
    if (it == m_FoundPageBreaks.end())
        return UnknownPageNumber;
    return it->second;
}