/////////////////////////////////////////////////////////////////////////////
/*
  Copyright 2001 Ronald S. Burkey

  This file is part of GutenMark.

  GutenMark is free software; you can redistribute it and/or modify
  it under the terms of the GNU General Public License as published by
  the Free Software Foundation; either version 2 of the License, or
  (at your option) any later version.

  GutenMark is distributed in the hope that it will be useful,
  but WITHOUT ANY WARRANTY; without even the implied warranty of
  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  GNU General Public License for more details.

  You should have received a copy of the GNU General Public License
  along with GutenMark; if not, write to the Free Software
  Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA

  Filename:	AutoMark.h 
  Purpose:	Header used by AutoMark.c and friends.
  Mods:		08/30/01 RSB	Began.
  		09/03/01 RSB	Added footnote stuff.
		11/02/01 RSB	Added GPL disclaimer and reformatted
				somewhat for first web release.
		11/09/01 RSB	Added MarkInsertMdash and MarkInsertNdash.
		11/12/01 RSB	Added the YesHeader option.
				Added MarkGutenbergEnder.
		11/13/01 RSB	Added MarkBeginSmartQuote and MarkEndSmartQuote.
		11/24/01 RSB	Added NoForeign.
		11/25/01 RSB	Added Joe Cherry patch (JCPATCH).
		12/09/01 RSB	Added FirstItalics and FirstCapital.
		12/10/01 RSB	Added SingleSpace and ForceNumeric.
		12/13/01 RSB	Added WeirdSequences.
		12/15/01 RSB	Added various stuff that had been in
				MarkBody.c before it was split into
				MarkBody.c+MarkByChar.c.  Added HTML_LENGTH.
		12/30/01 RSB	Added various functional prototypes associated
				with splitting MarkByLineHeuristic from
				MarkBody.  also added ForceNeuralNet and NeuralLearn
				(not that they do anything yet).
		01/01/02 RSB	Added IsHeaderHeuristic.
		01/13/02 RSB	Added Xml field.  Added OutputHtml,
				OutputLatex, OutputXml prototypes.
		01/18/02 RSB	Added NoPrefatory and PageBreaks.
		06/15/02 RSB	Added NoParskip						
  
  While there's no real upper bound on the number of POSSIBLE markups 
  in a file, as a practical matter the number of markups is going to be 
  approximately equal to the number of sentences (since each sentence 
  has an &nbsp; markup at the end) plus twice the number of paragraphs 
  (since each has a leading <p> and a trailing </p> markup).  So for a 
  very large 10M file, with an average sentence length of 50 characters, 
  and an average paragraph length of 5 sentences, that is roughly 200K 
  sentence markups plus 80K paragraph markups.  Assuming 
  sizeof(MarkupRecord)=8, the maximum amount of markup data is then 
  about 2M.  Where MarkupRecord is larger than 8 bytes (for instance 
  with a 4-byte long and a 4-byte enum plus alignment padding), scale 
  the estimate accordingly.
*/
#ifndef INCLUDED_AUTOMARK_H
#define INCLUDED_AUTOMARK_H
#ifdef __BORLANDC__
#include <string.h>
#include <alloc.h>
#include <ctype.h>
#define strcasecmp strcmpi
#define strncasecmp strncmpi
#endif
#include "libGutenSpell/libGutenSpell.h"

//------------------------------------------------------------------------
// Constants.

// NOTE(review): presumably the size limit for one line of input text;
// whether it counts the terminating NUL is not visible in this header --
// confirm against the code that declares the line buffers.
#define AUTOMARK_MAX_LINESIZE 1000

// Disabled fixed thresholds, left here for reference only.
// NOTE(review): presumably superseded by the ShortLineSize and
// ReallyShortLineSize fields of AnalysisDataset -- confirm.
//#define SHORT_LINE 50
//#define REALLY_SHORT_LINE 35

// Limits used when detecting the "prefatory" area.
#define MAX_PREFATORY_LINES 1000
#define MIN_TEXT_LINES 20

// The analysis looks at this many lines before the current line and the
// same number after it ...
#define PRE_OR_POST_LINES 10
// ... so the buffered window is: lines before + current line + lines after.
#define BUFFERED_LINES (PRE_OR_POST_LINES + 1 + PRE_OR_POST_LINES)

// Once this many headings have been matched from the prefatory area,
// all further matches are forced to come from there (or from special
// words).
#define PREFATORY_MATCH_CEILING 4

// How many foreign words of a phrase we try to retain in memory.
// Don't increase this, since 16 is already overkill.
#define MAX_BACKTRACKS 16

// An HTML line longer than this is a candidate for wrapping in a
// paragraph.
#define HTML_LENGTH 50

//-----------------------------------------------------------------------
// Datatypes.

// Every kind of markup that may be recorded within the body of the text;
// no other markups are made there.  Enumerators take consecutive values
// starting at 0 (the value is shown to the left of each name), so new
// kinds should be appended at the end to keep the existing values stable.
// NOTE(review): AnalysisDataset carries a MarkupFile handle, so these
// values are presumably stored outside the program -- confirm before
// reordering anything.
enum MarkupType
{
  /*  0 */ MarkNoMoreMarks = 0,
  /*  1 */ MarkRemoveChar,
  /*  2 */ MarkNbsp,
  /*  3 */ MarkBeginJustifiedParagraph,
  /*  4 */ MarkBeginRaggedParagraph,
  /*  5 */ MarkBeginCenteredParagraph,
  /*  6 */ MarkEndParagraph,
  /*  7 */ MarkBeginItalics,
  /*  8 */ MarkEndItalics,
  /*  9 */ MarkBeginBold,
  /* 10 */ MarkEndBold,
  /* 11 */ MarkBeginUnderline,
  /* 12 */ MarkEndUnderline,
  /* 13 */ MarkJumpPastGutenbergHeader,
  /* 14 */ MarkEndOfGutenbergHeader,
  /* 15 */ MarkBreak,
  /* 16 */ MarkBlockquote,
  /* 17 */ MarkEndBlockquote,
  /* 18 */ MarkHeader1,
  /* 19 */ MarkEndHeader1,
  /* 20 */ MarkInsertChar,	// The one kind that uses MarkupRecord.Insert.
  /* 21 */ MarkBeginSubtitle,
  /* 22 */ MarkEndSubtitle,
  /* 23 */ MarkBeginTable,
  /* 24 */ MarkEndTable,
  /* 25 */ MarkInsertMdash,
  /* 26 */ MarkInsertNdash,
  /* 27 */ MarkGutenbergEnder,
  /* 28 */ MarkBeginSmartQuote,
  /* 29 */ MarkEndSmartQuote,
  /* 30 */ MarkLsquo,
  /* 31 */ MarkRsquo,
  /* 32 */ MarkSoftHyphen,
  /* 33 */ MarkToupper,
  /* 34 */ MarkTolower
};

// One saved markup: which kind of markup applies, and where in the input
// file it applies.
typedef struct
{
  /* Where in the input file the markup occurs. */
  unsigned long Offset;

  /* Type of markup at that point. */
  enum MarkupType Type;

  /* Character payload; only used when Type is MarkInsertChar. */
  char Insert;
}
MarkupRecord;

// Result of the first-pass analysis of one line of the input file.  The
// information gathered here is geared towards heuristically locating
// paragraph breaks, headings, and/or verse, so the definitions of some
// items may not be quite what the names suggest.
//
// Apart from Offset, every member is a bit-field: 24 one-bit flags plus
// the 6-bit Leading count.
typedef struct
{
  /* Offset of the line in the input file. */
  unsigned long Offset;

  /* --- Whitespace, quoting and capitalization at the line start. --- */
  unsigned int Empty:1;			/* The line is just whitespace. */
  unsigned int BeginsWhite:1;		/* The line begins with whitespace. */
  unsigned int BeginsQuote:1;		/* The line begins with " or ' */
  unsigned int CapFirstChar:1;		/* First character is capitalized. */
  unsigned int CapFirstWord:1;		/* First word is capitalized. */
  unsigned int AllCaps:1;		/* The line is all caps. */
  unsigned int VerseCap:1;		/* Begins with cap or quote-cap. */

  /* --- Heading-like openings. --- */
  unsigned int BeginsBook:1;		/* Begins with "Book" or "BOOK". */
  unsigned int BeginsChapter:1;		/* "Chapter", "Chap.", "CHAPTER", etc. */
  unsigned int BeginsRoman:1;		/* Begins with a Roman number. */
  unsigned int SpecialRoman:1;		/* That number is "I", "X", or "C". */
  unsigned int WhiteAfterRoman:1;	/* Whitespace follows the number. */
  unsigned int OnlyRoman:1;		/* The line is ONLY a roman number. */

  /* --- Miscellaneous properties. --- */
  unsigned int EndPeriod:1;		/* Ends with a period. */
  unsigned int OutOfRange:1;		/* Dummy flag, fake record not in file. */
  unsigned int Contents:1;		/* "Contents" or "Table of Contents". */
  unsigned int Short:1;			/* Shorter than expected. */
  unsigned int ReallyShort:1;		/* ... and even more so! */
  unsigned int Scanned:1;		/* The line is a "scanned by" line. */
  unsigned int RaggedStart:1;		/* Starts with ragged space. */
  unsigned int FormFeedPrior:1;		/* Form feed in space prior to text. */
  /* NOTE(review): undocumented in the original; presumably the
     counterpart of FormFeedPrior for the space after the text -- confirm. */
  unsigned int FormFeedAfter:1;
  unsigned int PossibleTable:1;		/* Set if might be tabular data. */

  /* Various odd sequences of characters may appear in things like line
     drawings that are weird enough to short circuit normal justification,
     and which therefore should cause the line to be preferentially
     treated like preformatted text.  Some of these weirdities are:  5 or
     more periods in a row, 4 or more dashes in a row, lines beginning or
     ending with a vertical bar, etc. */
  unsigned int WeirdSequences:1;

  /* Actual leading space.  Six bits wide, so it can hold 0..63. */
  unsigned int Leading:6;
}
LineRecord;

// One footnote, held as a node of a singly linked list.  I've added this
// in preparation for using footnotes, but the more I think about it the
// more difficult it becomes.  So, this structure is probably not used
// for anything ...
struct FootnoteRecord
{
  /* Pointer to next, NULL if at end of list. */
  struct FootnoteRecord *Next;

  /* As it appeared in the body of the text, minus any brackets or
     braces. */
  char *OriginalText;

  unsigned int Pointer:1;	/* 1 if it wasn't the full text of the note. */
  unsigned int Brackets:1;	/* 1 if brackets enclosed the footnote. */
  unsigned int Braces:1;	/* 1 if braces enclosed the footnote. */
  unsigned int PreString:1;	/* 1 if a string preceded the note number. */
  unsigned int Pound:1;		/* 1 if a pound sign preceded the note number. */

  /* The original footnote number. */
  unsigned short Number;
};

// The full set of data for analyzing one file: the open file handles,
// the measurements taken from the text, the command-line options, the
// buffered prefatory lines, and the file's wordlist.
//
// NOTE(review): FILE needs <stdio.h>, which this header does not include
// itself; presumably it arrives via libGutenSpell.h or the including
// source file -- confirm.
typedef struct
{
  /* File handles. */
  FILE *InputFile;
  FILE *MarkupFile;
  FILE *LineFile;
  FILE *LogFile;

  /* If the file has a header, like a Gutenberg header, this is the
     offset of the end of that header. */
  unsigned long TextStart;

  /* file length - 500. */
  unsigned long FileEnderRegion;

  /* Distinguishes the paragraph style in which blank lines separate
     paragraphs (0) from the one in which paragraphs are indented (1). */
  int IndentedParagraphs;

  /* Total number of input lines not in the header. */
  int NumLines;

  int LowestPrefatoryLine;

  /* Lines after PG header prior to actual text. */
  int LowestNonPrefatoryLine;

  unsigned ShortLineSize;
  unsigned ReallyShortLineSize;

  /* Footnotes. */
  struct FootnoteRecord *Footnotes;

  /* Command-line options.  Where given, the trailing note is the tag or
     the date/initials under which the option was added. */
  int NoJustify;
  int NoMdash;
  int YesHeader;
  int NoForeign;
  int NoDiacritical;
  int Latex;			/* JCPATCH */
  int FirstItalics;		/* 12/09/01 RSB */
  int FirstCapital;		/* 12/09/01 RSB */
  int SingleSpace;		/* 12/10/01 RSB */
  int ForceNumeric;		/* 12/10/01 RSB */
  int ForceNeural;		/* 12/30/01 RSB */
  int NeuralLearn;		/* 12/30/01 RSB */
  int Xml;			/* 01/13/02 RSB */
  int NoPrefatory;		/* 01/18/02 RSB */
  int PageBreaks;		/* 01/18/02 RSB */
  int NoParskip;		/* 06/15/02 RSB */

  /* Buffer for lines from the "prefatory" area, with a parallel array
     of sizes (one entry per buffered line). */
  char *PrefatoryLines[MAX_PREFATORY_LINES];
  int PrefatoryLineSizes[MAX_PREFATORY_LINES];
  int NumNonTrivialPrefatoryLines;
  int NumPrefatoryMatched;

  /* The wordlist for the file. */
  Wordlist *Words;
}
AnalysisDataset;

// State held in common by MarkBody, MarkByLine, etc.  Members are
// declared one per line, in the same order as always, so the layout of
// the structure is unchanged.
typedef struct
{
  int InParagraph;
  int InBlockquote;
  int InHeader1;
  int InSubtitle;
  int LastWasHeader1;
  int ErrorCode;
  int InTable;
  int Versifying;
  int LastVersifying;
  int BracketItalicsCount;
  int FirstWordArea;

  /* Window of per-line analysis records, BUFFERED_LINES entries long. */
  LineRecord BufferedLines[BUFFERED_LINES];
  /* NOTE(review): presumably points at an entry of BufferedLines --
     confirm against the code that sets it. */
  LineRecord *LineInfo;

  int CouldBeName;
  int Italicizing;
  int Centering;
  int InWord;
  int WordItalicizing;		/* NOTE:  ALWAYS 0 now. */
  int TripSquote;
  int Foreignosity;
  int LastForeignosity;
  int InPreface;
  int SentenceStart;
  int BlockIndentation;

  enum MarkupType ParagraphType;
  enum MarkupType Type;

  /* Up to MAX_BACKTRACKS remembered spans of the etext. */
  struct
  {
    long Start;			/* Starting offset in etext. */
    long End;			/* Ending offset in etext. */
    unsigned long Languages;
  }
  ForeignBacktracks[MAX_BACKTRACKS];
  int NumBacktracks;

  long LastFirstSpace;
}
MarkStatus;

//-------------------------------------------------------------------------
// Function prototypes.
//
// The groupings below follow the function names only; see the individual
// source files for what each routine actually does.

// Output routines, one per output format named in the identifier.
int OutputHtml (FILE *fp, AnalysisDataset *Dataset);
int OutputLatex (FILE *fp, AnalysisDataset *Dataset);
int OutputXml (FILE *fp, AnalysisDataset *Dataset);

// Passes that operate on the dataset as a whole.
int Markup (AnalysisDataset *Dataset);
int CheckGutenbergHeader (AnalysisDataset *Dataset);
int MarkBody (AnalysisDataset *Dataset);
int LineAnalysisPass (AnalysisDataset *Dataset);
void PrefatoryAnalysisPassHeuristic (AnalysisDataset *Dataset);
void PrefatoryAnalysisPassNeural (AnalysisDataset *Dataset);

// Per-line and per-character marking.
int MarkByChar (AnalysisDataset *Dataset, MarkStatus *Status, char *s);
int MarkByLineHeuristic (AnalysisDataset *Dataset, MarkStatus *Status,
                         int LineNum, char *s);
int MarkByLineNeural (AnalysisDataset *Dataset, MarkStatus *Status,
                      int LineNum, char *s);
int AddMarkup (AnalysisDataset *Dataset, long Offset,
               enum MarkupType Type, char Insert);

// Header detection; all three variants share one signature.
int IsHeader (LineRecord *BufferedLines, int *LastWasHeader1, int LineNum,
              AnalysisDataset *Dataset, char *Line);
int IsHeaderHeuristic (LineRecord *BufferedLines, int *LastWasHeader1,
                       int LineNum, AnalysisDataset *Dataset, char *Line);
int IsHeaderNeural (LineRecord *BufferedLines, int *LastWasHeader1,
                    int LineNum, AnalysisDataset *Dataset, char *Line);

// Matching against special words and buffered prefatory lines.
int MatchSpecialWords (AnalysisDataset *Dataset, char *ss,
                       LineRecord *Line);
int MatchesPrefatoryLine (AnalysisDataset *Dataset, char *Line);

// Small text helpers.
char *rfgets (char *s, int size, FILE *fp);
int IsEndPunct (char c);
int IsEndSpace (char c);
int NormalizePotentialHeader (char *s);
void NormalizeTitle (char *s, int size);

#endif // INCLUDED_AUTOMARK_H