// ==========  This file is under  LGPL, the GNU Lesser General Public Licence
// ==========  Dialing Graphematical Module (www.aot.ru)
// ==========  Copyright by Alexey Sokirko (1996-2001)

#include "StdGraph.h"
#include  "../common/Graspace.h"
#include  "graline.h"
#include  "GraphmatFile.h"
#include "GraphanDicts.h"

// Number of graphematical descriptors; this is also the number of rows in DesArray.
const int NumberOfGraphematicalDescriptors  = 48;
const int ODesLen    = 9;                   /* capacity of one descriptor name in DesArray, including the terminating NUL */

// Textual names of the descriptors; GetDescriptorStr() indexes this table by descriptor number.
// The trailing "// a-b" remarks give the table indices of the names on that line.
const char DesArray[NumberOfGraphematicalDescriptors][ODesLen] = 
{
		     
    	     "BEG","RLE", "LLE","DEL","PUN",//the first position :  //0-4
		     "DC", "DSC","EA","_UNK_", // 5-8
		     //the second position :
		     "SPC","HYP","EOLN","EOP", //9-12
		     "PLP","AA", "aa", "Aa", // 13-16
			 "NAM?", "OPN","CLS", // 17-19
			 "EMSYM", "BUL", "INDENT", // 20-22
			 "PASS", "DPUN", "PAR_SYM", // 23-25

             // macrosyntax
             "CS?","CS","QUA","CS_PRNT","HDNG","CS_AUX","DOC", // 26-32

		     // pair descriptors : 
			 // if d is the first descriptor, then it should be always even, and d+1 should  be the corresponding second descriptor
			 // NOTE(review): by the index remarks below the first members (EXPR1 = 33, FAM1 = 35, ...) sit at ODD
			 // table indices, which contradicts the "always even" statement above - confirm against the
			 // Descriptors enum, which is declared outside this file.
		     "EXPR1","EXPR2", // 33-34
		     "FAM1","FAM2", // 35-36
             "RE1",  "RE2", // 37-38
			 "FILE1", "FILE2", // 39-40
			 "ABB1", "ABB2", // 41-42
			 "KEY1", "KEY2", // 43-44
			 "GDC1", "GDC2", // 45-46  // German divided compounds
			 "SENT_END" // 47

};


// Returns true when d is treated as the first (opening) member of a paired descriptor:
// an even value inside [OEXPR1, NumberOfGraphematicalDescriptors).
bool IsFirstMemberOfPairDesciptor(Descriptors d) 
{
	if ((d % 2) != 0)
		return false;
	if (d < (int)OEXPR1)
		return false;
	return d < NumberOfGraphematicalDescriptors;
}
// Returns true when d is treated as the second (closing) member of a paired descriptor:
// an odd value inside [OEXPR2, NumberOfGraphematicalDescriptors).
bool IsSecondMemberOfPairDesciptor(Descriptors d) 
{
	if ((d % 2) == 0)
		return false;
	if (d < (int)OEXPR2)
		return false;
	return d < NumberOfGraphematicalDescriptors;
}

// Maps the first member of a descriptor pair to its partner, which is the next descriptor value.
// d must satisfy IsFirstMemberOfPairDesciptor (checked by assert).
Descriptors GetSecondMemberByTheFirst(Descriptors d) 
{
	assert(IsFirstMemberOfPairDesciptor(d));
	const int partner = d + 1;
	return (Descriptors)partner;
}
// Maps the second member of a descriptor pair to its partner, which is the previous descriptor value.
// d must satisfy IsSecondMemberOfPairDesciptor (checked by assert).
Descriptors GetFirstMemberByTheSecond(Descriptors d) 
{
	assert(IsSecondMemberOfPairDesciptor(d));
	const int partner = d - 1;
	return (Descriptors)partner;
}

// Copies the textual name of descriptor number DescriptorNo into Result.
// Returns false (leaving Result untouched) when DescriptorNo is not a valid
// index into DesArray, i.e. it is negative or >= NumberOfGraphematicalDescriptors.
bool GetDescriptorStr(int DescriptorNo, string& Result)
{
	// a negative index would read memory in front of the table
	if (DescriptorNo < 0 || DescriptorNo >= NumberOfGraphematicalDescriptors)
		return false;

	Result = DesArray[DescriptorNo];
	return true;
}

// Returns a pointer to the textual name of descriptor number DescriptorNo.
// The pointer refers to the static DesArray table.
// DescriptorNo must lie in [0, NumberOfGraphematicalDescriptors); this is checked by assert only.
const char* GetDescriptorStr(int DescriptorNo)
{
	assert (DescriptorNo >= 0);
	assert (DescriptorNo < NumberOfGraphematicalDescriptors);
	return DesArray[DescriptorNo];
}


/*
 Returns true for every descriptor that ends a text period such as a sentence
 or a paragraph: the macrosyntax descriptors CS_Undef, CS_Simple, CS_Parent,
 CS_Quasi, CS_Heading, CS_Explan and the sentence-end descriptor OSentEnd.
*/
bool IsEndTextPeriodDescriptor (Descriptors d)
{
	switch (d)
	{
		case CS_Undef:
		case CS_Simple:
		case CS_Parent:
		case CS_Quasi:
		case CS_Heading:
		case CS_Explan:
		case OSentEnd:
			return true;
		default:
			return false;
	}
}


// Creates an empty line: no token buffer attached, zero lengths and offset,
// no status flags and no descriptors.
CGraLine::CGraLine()  
{
	unit = NULL;
	ulen = 0;
	slen = 0;
	m_InputOffset = 0;
	m_Status = 0;
	m_Descriptors = 0;
}

// Returns true when at least one of the descriptors CS_Undef, CS_Simple,
// CS_Parent, CS_Heading or CS_Explan is set on this line.
bool CGraLine :: HasMacroSyntaxDelimiter () const
{
	if	(	(		m_Descriptors
				&	(	_QM( CS_Undef ) | _QM( CS_Simple ) | _QM( CS_Parent ) | _QM( CS_Heading ) | _QM( CS_Explan ) )
			) > 0
		)
		return true;

	return false;
}

// Returns true when at least one of the descriptors ORLE, OLLE, ODC, ODSC
// or OAbbr1 is set on this line.
bool CGraLine :: IsWordOrNumberOrAbbr() const
{
	if	(	(		m_Descriptors
				&	(	_QM( ORLE ) | _QM( OLLE ) | _QM( ODC ) | _QM( ODSC ) | _QM( OAbbr1 ) )
			) != 0
		)
		return true;

	return false;
}


// Returns the length of the descriptor name that s starts with, or 0 when s
// does not start with any name from DesArray.
// The LONGEST matching name wins: "CS" is a prefix of "CS_PRNT" and "CS_AUX"
// and precedes them in the table, so stopping at the first match would report
// a length of 2 for those descriptors.
int get_descriptor_len ( const char * s)
{
	size_t BestLen = 0;

	for (int i = 0; i< NumberOfGraphematicalDescriptors; i++)
	{
		const size_t len = strlen (DesArray[i]);

		// only candidates that would improve the current result need comparing
		if ((len > BestLen) && (strncmp(s,DesArray[i],len) == 0))
			BestLen = len;
	}

	return (int)BestLen;
}
 

// Returns true when this token is exactly one ' ' character whose status is
// stSpace and nothing else.
bool CGraLine :: IsSingleSpaceToDelete() const
{
	if (GetTokenLength() != 1)
		return false;

	// only a plain space is deleted; tabulations are left alone
	if (GetToken()[0] != ' ')
		return false;

	// the status must be exactly stSpace, i.e. the space carries no other meaning;
	// for example, this keeps a converted "</p" from being deleted
	return m_Status == stSpace;
}


// Turns this line into a run of SpacesLength space characters: overwrites the
// token buffer with ' ', sets both lengths to SpacesLength, resets the status
// to stSpace and the descriptors to ODel | OSpc.
void CGraLine :: MakeSpaces(size_t SpacesLength)
{
	char* pBuffer = const_cast<char*>(unit);
	memset(pBuffer, ' ', SpacesLength);

	slen = SpacesLength;
	ulen = SpacesLength;
	m_Status = stSpace;
	m_Descriptors = _QM(ODel) | _QM(OSpc);
}


int  CGraLine :: ToInt () const 
{	
	char s[100];
	strncpy (s,unit,ulen);
	s[ulen] = 0;
	return atoi (s);
};


// True when the stNotPrint flag is set in m_Status.
bool CGraLine::IsNotPrint  () const 
{
	const bool bFlagSet = (m_Status & stNotPrint) != 0;
	return bFlagSet;
}
// True when the stEnglishName flag is set in m_Status.
bool CGraLine::IsEnglishName () const 
{
	const bool bFlagSet = (m_Status & stEnglishName) != 0;
	return bFlagSet;
}
// True when the stIdent flag is set in m_Status.
bool CGraLine::IsIdent () const 
{ 
	const bool bFlagSet = (m_Status & stIdent) != 0;
	return bFlagSet;
}

// True when the stGrouped flag is set in m_Status.
bool CGraLine::IsGrouped  () const 
{ 
	const bool bFlagSet = (m_Status & stGrouped) != 0;
	return bFlagSet;
}
// True when the stAbbreviation flag is set in m_Status.
bool CGraLine::IsAbbreviation  () const 
{ 
	const bool bFlagSet = (m_Status & stAbbreviation) != 0;
	return bFlagSet;
}
// True when the stParagraphChar flag is set in m_Status.
bool CGraLine::IsParagraphChar () const  
{ 
	const bool bFlagSet = (m_Status & stParagraphChar) != 0;
	return bFlagSet;
}
// True when the stPageBreak flag is set in m_Status.
bool CGraLine::IsPageBreak() const  
{ 
	const bool bFlagSet = (m_Status & stPageBreak) != 0;
	return bFlagSet;
}
// True when the stTextAreaEnd flag is set in m_Status.
bool CGraLine::IsTextAreaEnd() const  
{ 
	const bool bFlagSet = (m_Status & stTextAreaEnd) != 0;
	return bFlagSet;
}

// True when the stParagraphTag flag is set in m_Status.
bool CGraLine::IsParagraphTag () const  
{ 
	const bool bFlagSet = (m_Status & stParagraphTag) != 0;
	return bFlagSet;
}

// True when the stKeyModifier flag is set in m_Status.
bool CGraLine::IsKeyModifier()  const 
{ 
	const bool bFlagSet = (m_Status & stKeyModifier) != 0;
	return bFlagSet;
}
// True when the stElectronicAddress flag is set in m_Status.
bool CGraLine::IsElectronicAddress()  const 
{ 
	const bool bFlagSet = (m_Status & stElectronicAddress) != 0;
	return bFlagSet;
}
// Returns true when the token consists of exactly one character equal to c.
// Both sides are compared as unsigned bytes, so the check works the same way
// whether the caller passes a byte code above 127 (e.g. 149) or a plain char
// literal that is negative on platforms where char is signed.
bool CGraLine::IsChar (int c) const  
{ 
	if (ulen != 1)
		return false;

	return (unsigned char)unit[0] == (unsigned char)c;
}
// Returns true when the token is the single character '*' or the single byte
// with code 149.
bool CGraLine::IsAsterisk () const 
{ 
	// The code 149 is passed as a char so that it has the same representation
	// as the bytes stored in the token: as a plain int it can never be equal
	// to a token byte on platforms where char is signed.
	return   IsChar((unsigned char)'*') || IsChar((char)149);
}

// True when the stSingleSpaceAfter flag is set in m_Status.
bool CGraLine::HasSingleSpaceAfter()  const
{
	const bool bFlagSet = (m_Status & stSingleSpaceAfter) > 0;
	return bFlagSet;
}


// Returns true when the token text is exactly the NUL-terminated string s.
// The length of s is checked first: indexing s[ulen] directly would read past
// the end of s whenever s is shorter than the token.
bool CGraLine::IsString (const char* s) const 
{ 
	if (strlen(s) != ulen)
		return false;

	return strncmp(unit,s,ulen) == 0;
}
// Raises the stSpace flag in m_Status.
void CGraLine::SetSpace ()  
{
	m_Status = m_Status | stSpace;
}
// Raises the stEOLN flag in m_Status.
void CGraLine::SetEOLN  ()  
{ 
	m_Status = m_Status | stEOLN;
}
// Raises the stNotPrint flag in m_Status.
void CGraLine::SetNotPrint()  
{ 
	m_Status = m_Status | stNotPrint;
}
// Raises the stPunct flag in m_Status.
void CGraLine::SetPunct ()  
{ 
	m_Status = m_Status | stPunct;
}

// Raises the stParagraphChar flag in m_Status.
void CGraLine::SetParagraphChar()  
{ 
	m_Status = m_Status | stParagraphChar;
}
// Raises the stParagraphTag flag in m_Status.
void CGraLine::SetParagraphTag()  
{ 
	m_Status = m_Status | stParagraphTag;
}
// Raises the stKeyModifier flag in m_Status.
void CGraLine::SetKeyModifier()  
{ 
	m_Status = m_Status | stKeyModifier;
}
// Raises the stElectronicAddress flag in m_Status.
void CGraLine::SetElectronicAddress()  
{ 
	m_Status = m_Status | stElectronicAddress;
}


// Raises the stSingleSpaceAfter flag in m_Status.
void CGraLine::SetSingleSpaceAfter()  
{
	m_Status = m_Status | stSingleSpaceAfter;
}
// Raises the stIdent flag in m_Status.
void CGraLine::SetIdent()  
{ 
	m_Status = m_Status | stIdent;
}
// Raises the stPageBreak flag in m_Status.
void CGraLine::SetPageBreak()  
{ 
	m_Status = m_Status | stPageBreak;
}
// Raises the stTextAreaEnd flag in m_Status.
void CGraLine::SetTextAreaEnd()  
{ 
	m_Status = m_Status | stTextAreaEnd;
}
// Raises the stEnglishName flag in m_Status.
void CGraLine::SetEnglishName () 
{ 
	m_Status = m_Status | stEnglishName;
}

// Clears the bit of descriptor d in m_Descriptors.
void CGraLine::DelDes(Descriptors d)  
{ 
	m_Descriptors = m_Descriptors & ~( _QM(d) );
}


// Sets the bit of descriptor d in m_Descriptors.
void CGraLine::SetDes(Descriptors d)  
{
	m_Descriptors = m_Descriptors | _QM(d);
}


// ORs all the bits of add_state into m_Status.
void CGraLine::AddStatus(WORD add_state)
{
	m_Status = m_Status | add_state;
}

// Enlarges this line by the lengths of L: its token length is added to ulen
// and its screen length to slen. The token text itself is not touched.
void CGraLine::AddLength(const CGraLine& L)
{
	ulen = ulen + L.GetTokenLength();
	slen = slen + L.GetScreenLength();
}


void CGraLine::SetToken(const char* s)
{
	unit = s;
};


// Returns true when s begins with prefix, ignoring the case of ASCII letters in s.
// prefix must be written in lower case. The scan stops at the first mismatch,
// so s is never read past its terminating NUL.
static bool StartsWithLowerCasePrefix (const char* s, const char* prefix)
{
	for (; *prefix != 0; s++, prefix++)
	{
		char c = *s;
		if ((c >= 'A') && (c <= 'Z'))
			c = (char)(c - 'A' + 'a');
		if (c != *prefix)
			return false;
	}
	return true;
}

// If s starts with a known internet address prefix ("http://", "https://",
// "ftp://", "ftp.", "www." or "www2."), returns the length of that prefix;
// otherwise returns 0.
// The comparison ignores ASCII case, so mixed-case spellings such as
// "Http://" or "Www." are recognized as well as all-lower and all-upper ones.
size_t GetInternetAddressStarter (const char *s)
{
	static const char* const Starters[] =
	{
		"http://",
		"https://",
		"ftp://",
		"ftp.",
		"www.",
		"www2."
	};
	const size_t StartersCount = sizeof(Starters) / sizeof(Starters[0]);

	// no starter is a prefix of another one, so the order of the table does not matter
	for (size_t i = 0; i < StartersCount; i++)
		if (StartsWithLowerCasePrefix(s, Starters[i]))
			return strlen(Starters[i]);

	return 0;
}


// Computes the length of the word-like token that starts at s.
// Scans at most CriticalTokenLength characters. Letters and digits always
// belong to the token; an inner '-', '/', '_' and (under the conditions below)
// an inner '.' belong to it too; inside an electronic address '.', '_', '/',
// '\\' and ':' are accepted as well, and '@' followed by a letter or a digit
// turns the token into an electronic address.
// Side effects: may call SetKeyModifier() (when a registered key modifier is
// met) and SetElectronicAddress() (when a non-empty address was recognized).
// Returns the token length, or 1 when s starts neither a word nor a number.
size_t CGraLine::LengthUntilDelimiters (const char *s, const CGraphmatFile* G)
{
	bool bElectronicAddress = GetInternetAddressStarter(s) > 0;
	int i = 0;
	for (i=0; i<CriticalTokenLength; i++)
	{

		if (is_alpha((BYTE) s[i])) 
		{
			if (i == 0) // prohibit apostrophe at the first position
				if (s[i] == Apostrophe)
					break;
			continue;
		}


		if (isdigit((BYTE) s[i])) continue;

		if (G->m_pDicts->IsRegisteredKeyModifier(s, i))
		{
			SetKeyModifier();	 
			break;
		}

		if (i > 0)
		{
			if (s[i] == '-') continue; //  let an inner hyphen be part of the word, for example "test-test"
			if ((s[i] == '.') && (i+1 < CriticalTokenLength))
			{	
				// isdigit() only promises a non-zero value for digits, so the two
				// results are reduced to bool before they are compared
				const bool bDigitBefore = isdigit((BYTE)s[i-1]) != 0;
				const bool bDigitAfter = isdigit((BYTE)s[i+1]) != 0;

				if (		(bDigitBefore == bDigitAfter)
						&&	(		(G->m_Language != morphRussian)	// prohibit  "." as a word part  for Russian 
								||	!is_russian_alpha((BYTE)s[i-1])  // for example: г.Самара, В.И.Ленин
								||	!is_russian_alpha((BYTE)s[i+1])
							)
					)
				continue; //  if "." delimits alphas or digits, let an inner full stop be part of the word, for example "www.lenta.ru" or 1.12.12
						  // we exclude cases, if the full stop delimits digits and alphas, since it can lead to tokenization errors, for example such "1.We go to the north;2.We go to the south;"
			}

			if (s[i] == '/') continue; //  let an inner slash be part of the word, for example "TCP/IP"
			if (s[i] == '_') continue; //  let an underscore be part of the word, for example "al_sokirko"
		}

		if (bElectronicAddress)
		{
			if (s[i] == '.')  continue;
			if (s[i] == '_')  continue;
			if (s[i] == '/')  continue;
			if (s[i] == '\\')  continue;
			if (s[i] == ':')  continue;
		}
		if (s[i] == '@')
		{
			// sokirko@medialingua.ru
			if (i+1 < CriticalTokenLength)
			{
				if (is_alpha((BYTE) s[i+1])||isdigit((BYTE) s[i+1]) ) 
				{
					bElectronicAddress = true;
					continue;
				}
			}

		}

		break;
	}
	//  exclude the trailing full stops, slashes, colons and apostrophes from the word,
	//  for example we  do not consider "Israel/" as one word, but as two tokens "Israel" and "/". 
	//  but  do not exclude the last hyphen, cf German examples:
	//   "Reichsfinanz- und Reichsinnenministers"

	for ( ;i>0 && (s[i-1]=='.' || s[i-1]=='/' || s[i-1]==':' || s[i-1]=='\'');   i--);

	if (i == 0)
		return 1; // return 1, if this is not a word and not a number
	else
	{
		if (bElectronicAddress)
			SetElectronicAddress();	 
		
		return i;
	}
}


// Reads one token from the input buffer of G, starting at Offset, into this line.
// The token text is written to the buffer that `unit` points to; ulen/slen
// receive its length in bytes / on the screen, and the status flags describe
// the token kind (end of line, space, punctuation, identifier, ...).
// PageNumber is set to UnknownPageNumber unless a "<pb N>" page break tag is
// read, in which case it receives N.
// Returns the offset of the first input character after the token.
// NOTE(review): several branches look ahead (In[Offset+1], strncmp over a few
// characters) without checking the buffer size - presumably the input buffer
// is NUL-terminated or padded; confirm against CGraphmatFile.
size_t CGraLine::ReadWord (size_t Offset, const CGraphmatFile* G, DWORD& PageNumber)
{
	PageNumber = UnknownPageNumber;
	const char *s;
	const char* In = (const char*)&(G->GetInputBuffer()[0]);
	char *Out = const_cast<char*>(unit);
	BYTE len = 0;
	ulen = slen = 0;
	

	m_InputOffset = Offset;


	/*   If Carriage Return occurs ...*/
	if (In[Offset] == '\r')
	{
		if (In[Offset+1] != '\n')
		{
			// a lonely '\r' is either filtered out as unprintable or kept as punctuation
			if (G->m_bFilterUnprintableSymbols)
			{
				Out[0] = ' ';
				SetNotPrint();
			}
			else
			{
				Out[0] = In[Offset];
				SetPunct();
			}
			Offset++;
			slen = 1;
			ulen = 1;
		}
		else
		{
			// a run of "\r\n" pairs becomes one end-of-line token
			do
			{
				Out[ulen] = '\r';
				Out[ulen+1] = '\n';
				if (In[Offset+1] != '\n') break;
				ulen += 2;
				slen++;
				Offset+=2;
			}
			while ((In[Offset]=='\r') && (ulen < CriticalTokenLength));
			SetEOLN  ();
		}
	}
	else
	if (In[Offset] == '\n')
	{
		// a run of '\n' becomes one end-of-line token
		do
		{
			Out[ulen] = '\n';
			ulen += 1;
			Offset++;
			slen++;

		}
		while ((In[Offset] == '\n') && (ulen < CriticalTokenLength));
		SetEOLN  ();
	}
	else
	/* if it is a "</p>" (comes  from html) */
	if (		G->m_bUseParagraphTagToDivide
			&&	!strncmp(In+Offset,"</p>",4)
	)
	{
		memset (Out, ' ', 4);
		ulen = 4;
		slen = 1;
		SetSpace();
		SetParagraphTag();
		assert (IsParagraphTag());
		Offset += 4;
	}
	else
	// &nbsp; (a run of them is replaced by spaces and becomes one space token)
	if (		!strncmp(In+Offset,"&nbsp;",6) 
			||	!strncmp(In+Offset,"&NBSP;",6) 
	)
	{
		
		while	(			!strncmp(In+Offset,"&nbsp;",6) 
						||	!strncmp(In+Offset,"&NBSP;",6)
				)
		{
			if (ulen+6 >= CriticalTokenLength) break;
			memset (Out+ulen, ' ', 6);
			ulen += 6;
			slen++;
			Offset += 6;
			
		}
		SetSpace();

	}
	else
	// <br> (only the first 4 characters of the patterns are compared)
	if (		!strncmp(In+Offset,"<br>;",4) 
			||	!strncmp(In+Offset,"<BR>,",4) 
	)
	{
		Out[ulen] = '\n';
		ulen += 1;
		slen += 1;
		SetEOLN  ();
		Offset += 4;
	}
	else
	// </textarea>
	if	(!strncmp(In+Offset,"</textarea>",strlen("</textarea>")))
	{
		SetTextAreaEnd();
		SetSpace();
		int l = strlen("</textarea>");
		memset (Out, ' ', l);
		ulen += l;
		slen += l;
		Offset += l;
	}
	else
	// <pb 1> (a page break tag: "<pb " immediately followed by the page number)
	if	(		!strncmp(In+Offset,"<pb ",4) 
			&&	isdigit((BYTE)In[Offset+4])
		)
	{
		sscanf(In+Offset+4, "%u", &PageNumber);
		

		SetSpace();
		SetPageBreak();

		// the whole tag is replaced by spaces
		memset (Out, ' ', 4);
		ulen += 4;
		slen += 4;
		Offset += 4;
		while	(		(ulen < CriticalTokenLength)	// do not overrun the token buffer
					&&	(		isdigit((BYTE)In[Offset]) 
							||	isspace((BYTE)In[Offset])
						)
				)
		{
			Out[ulen] = ' ';
			ulen ++;
			slen ++;
			Offset++;
		}
		if ((In[Offset] == '>') && (ulen < CriticalTokenLength))
		{
			Out[ulen] = ' ';
			ulen ++;
			slen ++;
			Offset++;
		}
	}
	else
	/*  if TCP/IP occurs ..*/
	if (G->m_pDicts->FindInIdents ( In+Offset, len ))
	{
		slen = len;
		ulen = len;
		memcpy(Out,In+Offset,ulen);
		Offset += ulen;
		SetIdent  ();
	}
	else
	/*  if a Bracket occurs ..*/
	if (isbracket((BYTE)In[Offset]))
	{
		*Out = In[Offset];
		slen = ulen=1;
		Offset++;
		SetPunct();
	}
	else
	/* if a Space or Tabulation occurs ... */
	if (isnspace((BYTE)In[Offset]))
	{
		// a tabulation takes G->m_TabSize positions on the screen, any other space takes one
		for (;
				isnspace ((BYTE)In[Offset]) && ulen <CriticalTokenLength;
				ulen ++, slen += (In[Offset] == '\t') ? G->m_TabSize : 1, Offset++
			)
			Out[ulen] = In[Offset];

		SetSpace();
	}
	/*else
		if (	(G->m_Language == morphEnglish)
			&&	(In[Offset] == '\'') 
			&& (		(In[Offset+1] == 's')  
					||	(In[Offset+1] == 'S')
				) 
			&& (Offset>0)
			&& is_english_alpha((BYTE)In[Offset-1])
		)
	{
		Out[ulen] = In[Offset];
		Out[ulen+1] = In[Offset+1];
		ulen =2;
		slen = 2;
		Offset+=2;
		
	}*/
	else 
	/*
	a sequence of exclamation and question marks
	(it is used as a sentence end) 
	*/
	if (		((BYTE)In[Offset] == '?')  
			||	((BYTE)In[Offset] == '!') 
		)
	{
		for (	;
				(		(In[Offset] == '?')  
					||	(In[Offset] == '!') )
				&& (ulen < CriticalTokenLength);
				Offset++,
				slen++,ulen++
			)
			Out[ulen] = In[Offset];

		SetPunct();
	}
	else
	/* if it is a hard delimiter: a run of the same character becomes one token */
	if (ispunct((BYTE)In[Offset]) || is_pseudo_graph((BYTE)In[Offset]))
	{
		for (	int ch1 = In[Offset];
				(In[Offset] == ch1) && (ulen < CriticalTokenLength);
				Offset++,
				slen++,ulen++
			)
		Out[ulen] = ch1;

		SetPunct();
	}

	else
	/*  If it is not printable symbol and if the non-printable symbols should be filtered */
	if (		((BYTE) In[Offset] < 32)
			||	((BYTE) In[Offset] == cIonChar)
			||	((BYTE) In[Offset] == cNumberChar)
			||	((BYTE) In[Offset] == cPiChar)
			||	((BYTE) In[Offset] == cCompanyChar)
			||	((BYTE) In[Offset] == cEllipseChar)
		)
	{
		// a NUL byte is always replaced by a space, even when filtering is off
		if (G->m_bFilterUnprintableSymbols || !In[Offset])
		{
			Out[0] = ' ';
			SetNotPrint();
		}
		else
		{
			Out[0] = In[Offset];
			SetPunct();
		}

		if ((BYTE)In[Offset] == cParagraph)
			SetParagraphChar();	 

		Offset++;
		slen = 1;
		ulen = 1;
	}
	else
	/* If a word, number or something else  occurs ...*/
	{
		int WordLength;

		// a space-delimited word is only looked up when the next character is a space filler
		bool bCanBeSpaceDelimitedWord = 		
								( (G->GetInputBuffer().size() - Offset) > 2) 
							&&	is_spc_fill(In[Offset+1]);

		s =  bCanBeSpaceDelimitedWord ?  G->m_pDicts->SearchSpace (In+Offset,&WordLength) : NULL;

		if (s!=NULL)
		{
			// spaced words: the token text is taken from the dictionary string,
			// while the input advances by the length found in the text
			ulen = strlen(s);
			strncpy(Out,s,ulen);
			Offset += WordLength;
		}
		else
		{
			const char* CurrIn = In+Offset; 
			
			WordLength = LengthUntilDelimiters (CurrIn,G);
			

			// sequence N% is one token)
			if (WordLength == 1)
				if ((BYTE)CurrIn[WordLength-1] == 'N')	
					if (Offset + WordLength < G->GetInputBuffer().size())
						if ((BYTE)CurrIn[WordLength] == '%')
							{
								WordLength++;
							}

			// never run past the end of the input buffer
			if (Offset + WordLength >=  G->GetInputBuffer().size())
				WordLength = G->GetInputBuffer().size() -  Offset;

			strncpy(Out,CurrIn,WordLength);

			slen = ulen = WordLength;
			Offset += WordLength;
		}
	}

	return Offset;  
 }