/////////////////////////////////////////////////////////////////////////////
/*
  Copyright 2001 Ronald S. Burkey

  This file is part of GutenMark.

  GutenMark is free software; you can redistribute it and/or modify
  it under the terms of the GNU General Public License as published by
  the Free Software Foundation; either version 2 of the License, or
  (at your option) any later version.

  GutenMark is distributed in the hope that it will be useful,
  but WITHOUT ANY WARRANTY; without even the implied warranty of
  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  GNU General Public License for more details.

  You should have received a copy of the GNU General Public License
  along with GutenMark; if not, write to the Free Software
  Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA

  Filename:	LineAnalysisPass.c
  Purpose:	This pass collects useful info about the lines in the 
  		input file.  This data is later combined heuristically 
		to (hopefully) answer such questions as whether the line 
		is a header, where paragraphs
		begin and end, and so forth.
  Mods:		09/01/01 RSB	Began.
  		09/03/01 RSB	Altered definition of all-caps to include
				McSOMETHING, MacSOMETHING, DeSOMETHING.
		09/15/01 RSB	... and unfortunately introduced a bug in 
				which the NEXT character was always being 
				checked by IsLower!  Fixed this, and allowed 
				IsLower to take care of the lower-case 
				characters in "8-bit" ASCII.  Now allow 
				things like "(A" to count as VerseCap.
  		11/02/01 RSB	Added GPL disclaimer and reformatted 
				somewhat for first web release.
		12/13/01 RSB	Added WeirdSequences.
  
  The lines which are analyzed are not part of any header located previously.
  In other words, we begin with "line 0" at Dataset->TextStart.
*/
#include <stdio.h>
#include <string.h>
#include <ctype.h>
#include "AutoMark.h"
#ifdef __BORLANDC__
#define strncasecmp strncmpi
#endif

// Name prefixes (a capital letter followed by lower-case letters) whose
// lower-case letters are not held against the capitalization status of
// the name they introduce, as in "McSOMETHING".
const char *NamePrefixes[] = {
  "Mac",
  "Mc",
  "De",
  "Di",
  "Da",
  "La",
  "Le",
  "Van",
  "Von"
};
#define NUM_NAME_PREFIXES (sizeof (NamePrefixes) / sizeof (NamePrefixes[0]))

// Length of each NamePrefixes[] entry.  Filled in at run time by
// LineAnalysisPass before any line is examined.
int NamePrefixSizes[NUM_NAME_PREFIXES];

//----------------------------------------------------------------------
// This is a replacement for fgets, in which a form feed can terminate a 
// line.  A form feed is stored in the buffer as '\n'.
//
//   s     Destination buffer.
//   size  Maximum number of characters to store.  NOTE: unlike fgets, up
//         to `size' characters (not size-1) are stored, and the string is
//         NUL-terminated only if fewer than `size' characters were stored.
//         Callers that may see long lines must therefore supply a buffer
//         of size+1 bytes whose last byte is already '\0'.
//   fp    Stream to read from.
//
// Returns s if at least one character was stored (or if size <= 0, in
// which case nothing is read), and NULL if end-of-file or a read error
// occurred before any character was stored.

char *
rfgets (char *s, int size, FILE * fp)
{
  int ch;
  char *RetVal;
  RetVal = s;
  while (size > 0)
    {
      ch = getc (fp);
      if (ch == EOF)
	{
	  *s = 0;
	  // Like fgets, report end-of-file only when nothing was read;
	  // otherwise a final line lacking a newline would be lost.
	  if (s == RetVal)
	    return (NULL);
	  return (RetVal);
	}
      if (ch == '\f')
	ch = '\n';
      *s++ = (char) ch;
      size--;
      if (ch == '\n')
	break;
    }
  if (size > 0)
    *s = 0;
  return (RetVal);
}

//------------------------------------------------------------------------
// Checks if a character is one of the roman-numeral characters, in either
// case.  Returns 1 if so, 0 otherwise.
// We don't check for 'D' or 'M' because our entire focus is on detecting 
// chapter numbers or book numbers.  While there are books with over 100 
// chapters, I don't think there are any (or at least not many) with over
// 500.
static int
isroman (char c)
{
  // toupper requires a value representable as unsigned char (or EOF);
  // a plain char holding an 8-bit character may be negative.
  int u = toupper ((unsigned char) c);
  return (u == 'I' || u == 'V' || u == 'X' || u == 'L' || u == 'C');
}

//-----------------------------------------------------------------------
// Checks for characters unlikely to appear at the end of a header.
// Admittedly, some of these characters can appear, but ...
// Returns 1 if c is a punctuation character other than ')', else 0.
// Quotation marks are not exempted (an earlier exemption for '"' and '\''
// was disabled).
static int
IsEndPeriod (char c)
{
  // ispunct requires a value representable as unsigned char (or EOF);
  // a plain char holding an 8-bit character may be negative.
  return (ispunct ((unsigned char) c) && c != ')');
}

//----------------------------------------------------------------------
// Checks for lower-case letter at position sss in string s.  Exceptions 
// are made so that "McSOMETHING", "MacSOMETHING", "DeSOMETHING" and 
// maybe others aren't counted as lower case.  Note that as given, the 
// code only works for prefixes that actually begin with an upper-case 
// letter followed by lower-case letter(s).
//
//   s    Start of the line buffer; used only so that the character before
//        *sss is never read when *sss is the first character.
//   sss  In/out: address of the current position.  Advanced by one
//        character, or past a whole name prefix plus the capital that
//        follows it when such a prefix starts a word.
//
// Returns nonzero if the character examined is lower case ('a'..'z', or
// 223..255 in the 8-bit range), and 0 otherwise, including when a name
// prefix was skipped.  NamePrefixSizes[] must already hold the lengths of
// the NamePrefixes[] entries.

// Nonzero for 'A'..'Z' and for 192..222 in the 8-bit range.
#define IsPureUpper(c) ( \
  ( ((unsigned char) (c)) >= 'A' && ((unsigned char) (c)) <= 'Z' ) \
  || ( ((unsigned char) (c)) >= 192 && ((unsigned char) (c)) <= 222 ) \
)
static int
IsLower (char *s, char **sss)
{
  int i;
  char c;
  unsigned uch;
  uch = (unsigned char) **sss;

  // Only a capital followed by a lower-case letter can start a prefix, and
  // only at the start of a word (start of buffer, after white, after '-').
  // isspace requires a value representable as unsigned char, hence the cast.
  if ((uch >= 'A' && uch <= 'Z') && ((*sss)[1] >= 'a' && (*sss)[1] <= 'z'))
    if (*sss == s || isspace ((unsigned char) (*sss)[-1])
	|| (*sss)[-1] == '-')
      {
	for (i = 0; i < NUM_NAME_PREFIXES; i++)
	  if (0 == strncmp (*sss, NamePrefixes[i], NamePrefixSizes[i]))
	    {
	      // The prefix is forgiven only if a capital follows it.
	      c = (*sss)[NamePrefixSizes[i]];
	      if (IsPureUpper (c))
		{
		  (*sss) += NamePrefixSizes[i] + 1;
		  return (0);
		}
	    }
      }
  (*sss)++;
  return ((uch >= 'a' && uch <= 'z') || (uch >= 223 && uch <= 255));
}

//---------------------------------------------------------------------
// Words and phrases that mark a line as a likely heading when the line
// begins with one of them.  MatchSpecialWords compares them without regard
// to case and requires whitespace immediately after the entry, which is
// why punctuated variants such as "footnotes:" are listed separately.

static const char *SpecialWords[] = {
  "Book", "Chapter", "Chap", "Part",
  "footnotes", "footnotes:", "endnotes", "endnotes:", "index", "introduction",
  "preface",
  // "foreward" is a common misspelling of "foreword"; accept both.
  "foreword", "foreward", "summary", "conclusion", "appendix",
  "prologue", "prolog", "epilogue", "epilog", "glossary", "about the author",
  "advertisement", "advertisement.", "synopsis", "executive summary",
  "recommendation",
  "recommendations", "letter", "story"
};
#define NUM_SPECIALWORDS (sizeof (SpecialWords) / sizeof (SpecialWords[0]))

// Lengths of the SpecialWords[] entries, computed by MatchSpecialWords on
// its first call; SpecialWordsInitialized records that this has been done.
static int SpecialWordSizes[NUM_SPECIALWORDS];
static int SpecialWordsInitialized = 0;
// Checks whether the text at ss, after any leading whitespace, begins with
// a heading keyword.  All comparisons ignore case.
//
//   Dataset  Not used by this function.
//   ss       NUL-terminated line text.
//   Line     Record to update: Contents is set when the text begins with
//            "contents" or "table of contents"; BeginsChapter is set when
//            it begins with a SpecialWords[] entry followed by whitespace.
//
// Returns 1 if a SpecialWords[] entry matched, otherwise 0 (a "contents"
// match by itself still returns 0).
int
MatchSpecialWords (AnalysisDataset * Dataset, char *ss, LineRecord * Line)
{
  int i;

  // Cache the entry lengths on the first call.
  if (!SpecialWordsInitialized)
    {
      for (i = 0; i < NUM_SPECIALWORDS; i++)
	SpecialWordSizes[i] = strlen (SpecialWords[i]);
      SpecialWordsInitialized = 1;
    }

  // isspace requires a value representable as unsigned char (or EOF); a
  // plain char holding an 8-bit character may be negative.
  for (; isspace ((unsigned char) *ss); ss++);
  if (strncasecmp (ss, "contents", 8) == 0
      || strncasecmp (ss, "table of contents", 17) == 0)
    Line->Contents = 1;
  for (i = 0; i < NUM_SPECIALWORDS; i++)
    if (0 == strncasecmp (ss, SpecialWords[i], SpecialWordSizes[i])
	&& isspace ((unsigned char) ss[SpecialWordSizes[i]]))
      {
	Line->BeginsChapter = 1;
	return (1);
      }
  return (0);
}

//-----------------------------------------------------------------------
// Reads the input file line by line, starting at Dataset->TextStart, and
// writes one LineRecord per line to Dataset->LineFile.  Each record holds
// the line's file offset plus flags describing leading white space,
// capitalization, heading keywords, Roman numerals, ending punctuation,
// line length and "weird" character sequences.
//
// Returns 0 on success, or 3 if a record could not be written.  In either
// case Dataset->NumLines is set from the current position in the line
// file divided by the record size.

int
LineAnalysisPass (AnalysisDataset * Dataset)
{
  unsigned long Offset;
  char s[AUTOMARK_MAX_LINESIZE], *ss, *sss, *siv;
  LineRecord Line, DefaultLine = { 0 };
  int ReturnValue = 1;
  int i, Leading;

  // Set up the name-prefix array (IsLower depends on these lengths).
  for (i = 0; i < NUM_NAME_PREFIXES; i++)
    NamePrefixSizes[i] = strlen (NamePrefixes[i]);
  Offset = Dataset->TextStart;

  // rfgets is given one byte less than the buffer and does not terminate
  // a line that fills all of that space, so keep a terminator in the
  // spare last byte.
  s[sizeof (s) - 1] = '\0';
  fseek (Dataset->InputFile, Dataset->TextStart, SEEK_SET);
  while (NULL != rfgets (s, sizeof (s) - 1, Dataset->InputFile))
    {
      Line = DefaultLine;

      // First record the offset of the line.
      Line.Offset = Offset;
      Offset = ftell (Dataset->InputFile);

      // Now, miscellaneous other info.  Note that every <ctype.h> call
      // below casts to unsigned char, because a plain char holding an
      // 8-bit character may be negative, which those functions don't allow.
      if (*s == '\0')
        Line.Empty = 1;
      else
        {

          // Check for possible tabular data.  Recognized by multiple 
          // whitespace between non-white characters.  This is defined 
          // as at least 3 whites, or at least 4 whites following 
          // punctuation.
          for (ss = s; isspace ((unsigned char) *ss); ss++);  // first non-space.

          // Find the end of the line, starting AT ss rather than after it:
          // on an all-white line ss already points at the terminator, and
          // scanning from ss + 1 would read leftover bytes of earlier lines.
          for (sss = ss; *sss; sss++);
          for (sss--; sss > s && isspace ((unsigned char) *sss); sss--);
          for ( /*ss */ ; ss <= sss; ss++)
            if (isspace ((unsigned char) *ss)
                && isspace ((unsigned char) ss[1])
                && isspace ((unsigned char) ss[2])
                && (!ispunct ((unsigned char) ss[-1])
                    || isspace ((unsigned char) ss[3])))
              {
                Line.PossibleTable = 1;
                break;
              }

          // Check out the condition of white space at the beginning of 
          // the line.  Leading is the indentation in columns, with tabs
          // advancing to the next multiple of 8; other white characters
          // add nothing.  (Form feeds never get here, because rfgets
          // turns them into newlines.)
          if (isspace ((unsigned char) *s))
            Line.BeginsWhite = 1;
          Leading = 0;
          for (ss = s; isspace ((unsigned char) *ss); ss++)
            if (*ss == ' ')
              Leading++;
            else if (*ss == '\t')
              Leading = (Leading + 8) & ~7;
          Line.WeirdSequences = (*ss == '|');
          Line.BeginsQuote = (*ss == '\'' || *ss == '\"');
          if (Leading < 64)
            Line.Leading = Leading;
          else
            Line.Leading = 63;
          if (*ss == '\0')
            Line.Empty = 1;
          else
            {
              Line.RaggedStart = (Leading > 8 && 0 != (Leading & 7)
                                  && 0 != (Leading % 5));

              // Check the various capitalization things.  For VerseCap,
              // a leading "--" (plus white) and up to two opening quotes
              // or brackets are skipped first.
              sss = ss;
              if (*sss == '-' && sss[1] == '-')
                {
                  sss += 2;
                  while (isspace ((unsigned char) *sss))
                    sss++;
                }

              // Note that could have "( or (". 
              if (*sss == '\"' || *sss == '\'' || *sss == '(' || *sss == '{')
                sss++;
              if (*sss == '\"' || *sss == '\'' || *sss == '(' || *sss == '{')
                sss++;
              if (IsPureUpper (*sss))
                Line.VerseCap = 1;
              if (IsPureUpper (*ss))
                {
                  Line.CapFirstChar = 1;
                  Line.VerseCap = 1;

                  // CapFirstWord: no lower-case letter in the first word.
                  // IsLower advances sss itself.
                  Line.CapFirstWord = 1;
                  for (sss = ss; isalpha ((unsigned char) *sss);)
                    if (IsLower (s, &sss))
                      {
                        Line.CapFirstWord = 0;
                        break;
                      }

                  // AllCaps: no lower-case letter in the rest of the 
                  // line either.
                  if (Line.CapFirstWord)
                    {
                      Line.AllCaps = 1;
                      for (; *sss;)
                        if (IsLower (s, &sss))
                          {
                            Line.AllCaps = 0;
                            break;
                          }
                    }
                }

              // Check if the line specifically indicates a chapter heading.
              if (MatchSpecialWords (Dataset, ss, &Line))
                Line.BeginsBook = 1;
              if (Line.Offset == Dataset->TextStart)
                Line.Scanned = 1;

              // Check if the line begins with a Roman numeral, i.e. if 
              // every character up to the first white or punctuation
              // is a Roman-numeral character.
              Line.BeginsRoman = isroman (*ss);
              for (sss = ss + 1;
                   *sss && !isspace ((unsigned char) *sss)
                   && !ispunct ((unsigned char) *sss); sss++)
                if (!isroman (*sss))
                  {
                    Line.BeginsRoman = 0;
                    break;
                  }
              if (Line.BeginsRoman)
                {

                  // OnlyRoman: nothing but an optional '.' and white
                  // follows the numeral.
                  siv = sss;
                  if (*siv == '.')
                    siv++;
                  while (isspace ((unsigned char) *siv))
                    siv++;
                  if (*siv == '\0')
                    Line.OnlyRoman = 1;
                }
              Line.WhiteAfterRoman = (Line.BeginsRoman
                                      && isspace ((unsigned char) *sss));

              // SpecialRoman: a one-character numeral that is exactly 
              // 'I', 'X' or 'C'.
              Line.SpecialRoman = (Line.BeginsRoman && sss == ss + 1 &&
                                   (*ss == 'I' || *ss == 'X' || *ss == 'C'));
              if (Line.SpecialRoman)
                {

                  // If "I" (or "I." is the ONLY thing on the line, 
                  // then it's just as good as any other Roman number.
                  if (*sss == '.')
                    sss++;
                  for (; isspace ((unsigned char) *sss); sss++);
                  if (*sss == '\0')
                    Line.SpecialRoman = 0;
                }

              // See if there's a period at the end of the line.  First
              // put sss on the last non-white character.  If the line has
              // no newline, the scan stops on the terminator, so step 
              // back onto the last real character (ss is known to point 
              // at a non-white character, so this cannot pass ss).
              for (sss = ss; *sss && *sss != '\n'; sss++);
              if (*sss == '\0')
                sss--;
              for (; sss > s && isspace ((unsigned char) *sss); sss--);
              Line.WeirdSequences = Line.WeirdSequences || (*sss == '|');
              if (sss > s && IsEndPeriod (sss[0]))
                Line.EndPeriod = 1;
              if (sss > s + 1 && IsEndPeriod (sss[-1])
                  && (sss[0] == '\"' || sss[0] == '\''))
                Line.EndPeriod = 1;

              // sss - ss is one less than the trimmed length of the line.
              if (sss - ss <= Dataset->ShortLineSize)
                Line.Short = 1;
              if (sss - ss <= Dataset->ReallyShortLineSize)
                Line.ReallyShort = 1;
            }
        }

      // Look for other weird stuff.
      Line.WeirdSequences = Line.WeirdSequences ||
        NULL != strstr (s, ".....") ||
        NULL != strstr (s, ". . . . .") ||
        NULL != strstr (s, "-----") || NULL != strstr (s, "___");

      // Write the line data.                   
      if (1 != fwrite (&Line, sizeof (Line), 1, Dataset->LineFile))
        {
          fprintf (stderr, "Disk-write error.\n");
          ReturnValue = 3;
          goto Done;
        }
    }
  ReturnValue = 0;
Done:
  Dataset->NumLines = ftell (Dataset->LineFile) / sizeof (LineRecord);
  return (ReturnValue);
}