///////////////////////////////////////////////////////////////////////////// 
/* 
  Copyright 2001 Ronald S. Burkey 
 
  This file is part of GutenMark. 
 
  GutenMark is free software; you can redistribute it and/or modify 
  it under the terms of the GNU General Public License as published by 
  the Free Software Foundation; either version 2 of the License, or 
  (at your option) any later version. 
 
  GutenMark is distributed in the hope that it will be useful, 
  but WITHOUT ANY WARRANTY; without even the implied warranty of 
  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the 
  GNU General Public License for more details. 
 
  You should have received a copy of the GNU General Public License 
  along with GutenMark; if not, write to the Free Software 
  Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA 
 
  Filename:	PrefatoryAnalysisPassHeuristic.c 
  Purpose:	Identifies the document's "prefatory area" using a heuristic
  		method.  An alternate drop-in replacement using a neural-net
		method may be selected instead at runtime. 
  Mods:		01/01/02 RSB	Moved and renamed from MarkBody.c.
  		06/15/02 RSB	Corrected a shocking indexing error.  How
				could this ever have worked? 
*/
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <ctype.h>
#include "AutoMark.h"

//--------------------------------------------------------------------------- 
// Attempt to find the "prefatory" area, which is what I call the area between
// the PG header and the actual text.  The net result of the pass is to set
// Dataset->LowestPrefatoryLine and Dataset->LowestNonPrefatoryLine, and to
// buffer the area's non-trivial lines in Dataset->PrefatoryLines.  The basic
// technique is to find, within the first MAX_PREFATORY_LINES, the first
// candidate heading followed by at least MIN_TEXT_LINES of non-headings.
// Obviously, some additional flourishes may be required.  For example, if we
// find a non-blank line that's a duplicate of a prior line, it is probably
// the first section.
// 
// The reason I'm doing this at all is that if the Gutenberg text contains a
// "table of contents", it really messes things up in terms of extraneous
// headings.  So if I can find something labeled "contents" or "table of
// contents", I'll include it even if it's outside of these parameters.
//
// Dataset: the analysis dataset being processed.  This pass reads
// Dataset->InputFile (starting at Dataset->TextStart) and Dataset->LineFile,
// and writes Dataset->LowestPrefatoryLine, Dataset->LowestNonPrefatoryLine,
// Dataset->NumNonTrivialPrefatoryLines, Dataset->PrefatoryLines[] and
// Dataset->PrefatoryLineSizes[].  On return, LineFile has been rewound to
// offset 0 and InputFile has been repositioned at Dataset->TextStart.
void
PrefatoryAnalysisPassHeuristic (AnalysisDataset * Dataset)
{
  int i, j, k, TextLines, LastWasHeader1;
  LineRecord BufferedLines[BUFFERED_LINES], DefaultLine = {
    0
  };
  char s[256], *ss;
  Dataset->LowestPrefatoryLine = 0;
  Dataset->LowestNonPrefatoryLine = 0;

  // See if we can find something labeled as the table of contents. 
  // Everything in front of that will be included in the prefatory 
  // area by default (but won't be buffered).  If no such label is found
  // within the first MAX_PREFATORY_LINES lines, both bounds stay at 0.
  s[sizeof (s) - 1] = 0;
  fseek (Dataset->InputFile, Dataset->TextStart, SEEK_SET);
  for (i = 0; i < MAX_PREFATORY_LINES; i++)
    if (NULL != rfgets (s, sizeof (s) - 1, Dataset->InputFile))
      {
        NormalizePotentialHeader (s);
        if (0 == strcmp (s, "CONTENTS")
            || 0 == strcmp (s, "TABLE OF CONTENTS")
            || 0 == strncmp (s, "CONTENTS OF", 11)
            || 0 == strncmp (s, "TABLE OF CONTENTS OF", 20))
          {
            Dataset->LowestPrefatoryLine = Dataset->LowestNonPrefatoryLine =
              i;
            break;
          }
      }
    else
      break;

  // Prime the sliding window of line records.  The first PRE_OR_POST_LINES
  // slots are placeholders (OutOfRange and Empty set); the remaining slots
  // are read from the start of LineFile, so that slot PRE_OR_POST_LINES
  // holds the record for line 0.  Any slot that cannot be read also gets
  // the placeholder.
  DefaultLine.OutOfRange = 1;
  DefaultLine.Empty = 1;
  fseek (Dataset->LineFile, 0, SEEK_SET);
  for (i = 0; i < PRE_OR_POST_LINES; i++)
    BufferedLines[i] = DefaultLine;
  for (; i < BUFFERED_LINES; i++)
    {
      if (1 !=
          fread (BufferedLines + i, sizeof (LineRecord), 1,
                 Dataset->LineFile))
        BufferedLines[i] = DefaultLine;
    }

  // Now do the actual search.  Each header found becomes the new candidate
  // end of the prefatory area and restarts the count of text lines; the
  // search stops once more than MIN_TEXT_LINES non-header lines follow a
  // candidate.  TextLines starts at -MAX_PREFATORY_LINES so that, until a
  // header has reset it to 0, it cannot climb above 0 within this loop
  // (NOTE(review): this relies on MIN_TEXT_LINES being non-negative).
  TextLines = -MAX_PREFATORY_LINES;
  for (i = 0; i < MAX_PREFATORY_LINES; i++)
    {
      // Lines in front of the table-of-contents label are not examined,
      // but the window is still advanced past them below.
      if (i >= Dataset->LowestPrefatoryLine)
        {
          // The flag is cleared before every call, so no header state is
          // carried from one line to the next here.
          LastWasHeader1 = 0;
          // A record whose Contents flag is set is never taken as a header
          // (the flag's meaning is defined along with LineRecord).
          if (!BufferedLines[PRE_OR_POST_LINES].Contents
              && IsHeader (BufferedLines, &LastWasHeader1, i, Dataset, NULL))
            {
              Dataset->LowestNonPrefatoryLine = i;
              TextLines = 0;
            }
          else
            TextLines++;
          if (TextLines > MIN_TEXT_LINES)
            break;
        }

      // Load the next line: slide the window down by one record and refill
      // the last slot, using the placeholder once LineFile is exhausted.
      memmove (BufferedLines, BufferedLines + 1,
               (BUFFERED_LINES - 1) * sizeof (LineRecord));
      if (1 !=
          fread (BufferedLines + BUFFERED_LINES - 1, sizeof (LineRecord), 1,
                 Dataset->LineFile))
        BufferedLines[BUFFERED_LINES - 1] = DefaultLine;
    }
  // No header followed by enough text was found: make the area empty.
  if (TextLines <= MIN_TEXT_LINES)
    Dataset->LowestNonPrefatoryLine = Dataset->LowestPrefatoryLine;
  fseek (Dataset->LineFile, 0, SEEK_SET);

  // Now that the prefatory area has been located, buffer the actual lines 
  // from it.  But don't allow duplicates.  If duplicates are found,  
  // then shorten up the area. 
  fseek (Dataset->InputFile, Dataset->TextStart, SEEK_SET);
  s[sizeof (s) - 1] = 0;
  Dataset->NumNonTrivialPrefatoryLines = 0;
  for (i = 0; i < Dataset->LowestNonPrefatoryLine; i++)
    {
      // Stop if the input runs out.  Without this check, s would still
      // hold the previous (already normalized) line, which could then be
      // mistaken for a duplicate and wrongly shorten the area.  A NULL
      // return is treated as "no more input", as in the contents scan above.
      if (NULL == rfgets (s, sizeof (s) - 1, Dataset->InputFile))
        break;
      if (i < Dataset->LowestPrefatoryLine)
        continue;
      // NOTE(review): j is assumed to be the length of the normalized
      // string left in s; the allocation below depends on it -- confirm
      // against NormalizePotentialHeader.
      j = NormalizePotentialHeader (s);
      if (j > 5)
        {

          // Here we have a non-trivial line.  Let's check to see if we've  
          // already buffered it. 
          for (k = 0; k < Dataset->NumNonTrivialPrefatoryLines; k++)
            if (0 == strcmp (s, Dataset->PrefatoryLines[k]))
              {

                // It's a duplicate!  The prefatory area ends just before it.
                Dataset->LowestNonPrefatoryLine = i;
                goto DoneBuffering;
              }

          // Haven't buffered it yet, so it's okay to continue.  If the
          // allocation fails the line is simply not buffered.
          // NOTE(review): no bound on PrefatoryLines[] is checked here;
          // confirm the array holds at least MAX_PREFATORY_LINES entries.
          ss = (char *) calloc (1, j + 1);
          if (ss != NULL)
            {
              Dataset->PrefatoryLines[Dataset->NumNonTrivialPrefatoryLines] =
                ss;
              strcpy (ss, s);
              Dataset->PrefatoryLineSizes[Dataset->
                                          NumNonTrivialPrefatoryLines] = j;
              (Dataset->NumNonTrivialPrefatoryLines)++;
            }
        }
    }
DoneBuffering:fseek (Dataset->InputFile, Dataset->TextStart, SEEK_SET);
}