// ========== This file is under LGPL, the GNU Lesser General Public Licence // ========== Dialing Graphematical Module (www.aot.ru) // ========== Copyright by Alexey Sokirko and Andrey Kalinin(1996-2001) #include "StdGraph.h" #include "assert.h" #include "HTMLConv.h" /* * Return offset... */ unsigned long HTML::getOffset(unsigned long off) { assert (m_bCollectOffsets); unsigned long cur_offset = 0; int i=0; for(; i < offsets.size(); i++) { cur_offset += offsets[i].high - offsets[i].low + 1; if(off <= cur_offset) break; } assert(i != offsets.size()); return offsets[i].high - (cur_offset - off); } /* * Add offset to the container */ void HTML::addOffset(unsigned long off) { if (!m_bCollectOffsets) return; if(offsets.empty()) offsets.push_back(offset_range(off, off)); else if(offsets.back().high == off - 1) offsets.back().high++; else offsets.push_back(offset_range(off, off)); } /*************************************************************************** * Very simple parser */ string HTML::GetTextFromHTMLBuffer(const char* Buffer, size_t BufferLen) { offsets.clear(); size_t cur_offset = 0, old_offset = 0; string result; string cur_tag, cur_amp; bool next_read = true; enum { normal = 0, tag, amp, spaces } state = normal; stack NotTextTags; BYTE ch; while (cur_offset < BufferLen) { if(next_read) { ch = (BYTE)Buffer[cur_offset]; cur_offset++; } else next_read = true; switch(state) { case normal: { if(isspace(ch)) { if (NotTextTags.empty()) { result += ' '; addOffset(cur_offset-1); } state = spaces; break; } else switch(ch) { case '<': state = tag; old_offset = cur_offset-1; cur_tag.erase(); break; case '&': state = amp; old_offset = cur_offset-1; cur_amp.erase(); break; default: if (NotTextTags.empty()) { result += ch; addOffset(cur_offset-1); }; } break; } case tag: { switch(ch) { case '>': state = normal; if(checkTag(cur_tag,"br")) { if (NotTextTags.empty()) { result += "\n"; addOffset(old_offset); }; state = spaces; } else if( checkTag(cur_tag,"xml") ) { NotTextTags.push("/xml"); } else // the tag itself can be very long for example"