/////////////////////////////////////////////////////////////////////////////
/*
  Copyright 2001 Ronald S. Burkey

  This file is part of GutenMark.

  GutenMark is free software; you can redistribute it and/or modify
  it under the terms of the GNU General Public License as published by
  the Free Software Foundation; either version 2 of the License, or
  (at your option) any later version.

  GutenMark is distributed in the hope that it will be useful,
  but WITHOUT ANY WARRANTY; without even the implied warranty of
  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  GNU General Public License for more details.

  You should have received a copy of the GNU General Public License
  along with GutenMark; if not, write to the Free Software
  Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA

  Filename:	Markup.c 
  Purpose:	This is the analysis function used by AutoMark.c
  Mods:		08/30/01 RSB	Began.
  		11/02/01 RSB	Added GPL disclaimer and reformatted 
				somewhat for first web release.
  
  This file analyzes the input file, and writes MarkupRecord records to the 
  markup file.  The markup records are written in the same order they will
  eventually appear within the HTML.  I was going to write this to accommodate
  either DOS or Unix end-of-line conventions, but at least at first, I've 
  decided to just assume the text file is in the native format, so that
  lines are terminated with \n.
*/
#include <stdio.h>
#include <string.h>
#include "AutoMark.h"

//---------------------------------------------------------------------

// Analyzes the input text and marks it up.
//
// Steps:
//   1. Runs the Gutenberg-header check on the dataset.
//   2. Estimates the typical full-line length of the text and derives
//      Dataset->ShortLineSize and Dataset->ReallyShortLineSize from it.
//   3. Calls MarkBody() to mark paragraphs.
//
// Returns 0 on success, or 1 (after printing a message to stderr) if
// MarkBody() reports an error.
int
Markup (AnalysisDataset * Dataset)
{
  // Parameters of the margin estimate.
  enum
  {
    LINES_PER_GROUP = 50,		// Lines examined per sample group.
    MAX_GROUPS = 100,			// Upper bound on groups sampled.
    DEFAULT_SHORT_LINE = 50,		// Used when no full group exists.
    DEFAULT_REALLY_SHORT_LINE = 35	// Ditto.
  };
  unsigned SizeSum = 0;
  int NumSamples = 0;
  char s[256];

  // First, let's analyze the input file to see if it has a header.
  // The various analysis functions for different header types return 
  // non-zero when that particular header type has been found.  When one
  // returns zero, the header hasn't been found, so checks for other 
  // header types could be carried out here.  No other header types are
  // checked at present, so the result is deliberately discarded.
  (void) CheckGutenbergHeader (Dataset);

  // Let's determine the approximate margins.  What we do is to take the 
  // longest line out of each group of fifty lines, and then average them.  
  // Sampling starts at Dataset->TextStart (presumably set by the header 
  // check so as to skip the Gutenberg header -- confirm), and any partial 
  // group at the end of the file is not included.
  if (0 != fseek (Dataset->InputFile, Dataset->TextStart, SEEK_SET))
    {
      // Without a successful seek we would be sampling from an unknown 
      // position, so skip the sampling; the defaults below are then used.
      fprintf (stderr,
	       "Warning: cannot seek to start of text; "
	       "using default line sizes.\n");
    }
  else
    {
      while (NumSamples < MAX_GROUPS)
	{
	  int i;
	  unsigned MaxInGroup = 0;

	  for (i = 0; i < LINES_PER_GROUP; i++)
	    {
	      size_t Length;

	      // The size passed is one less than the buffer size, so a
	      // physical line longer than that is read in several pieces,
	      // each of which counts as a line.  fgets always terminates
	      // the string, and the length includes any trailing '\n'.
	      if (NULL == fgets (s, sizeof (s) - 1, Dataset->InputFile))
		break;
	      Length = strlen (s);
	      if (Length > MaxInGroup)
		MaxInGroup = (unsigned) Length;
	    }

	  // End of file (or a read error) in mid-group:  discard the
	  // partial group and stop sampling.
	  if (i < LINES_PER_GROUP)
	    break;

	  SizeSum += MaxInGroup;
	  NumSamples++;
	}
    }

  if (NumSamples == 0)
    {
      // Not even one full group was available:  use fixed defaults.
      Dataset->ShortLineSize = DEFAULT_SHORT_LINE;
      Dataset->ReallyShortLineSize = DEFAULT_REALLY_SHORT_LINE;
    }
  else
    {
      // Average of the per-group maxima (integer division).
      unsigned Average = SizeSum / NumSamples;

      // The thresholds are fixed fractions (10/13 and 5/9) of the 
      // average long-line length.
      Dataset->ShortLineSize = (10 * Average) / 13;
      Dataset->ReallyShortLineSize = (5 * Average) / 9;
    }

  // Let's mark paragraphs.
  if (MarkBody (Dataset))
    {
      fprintf (stderr, "Error marking-up text.\n");
      return (1);
    }
  return (0);
}