///////////////////////////////////////////////////////////////////////////// /* Copyright 2001 Ronald S. Burkey This file is part of GutenMark. GutenMark is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation; either version 2 of the License, or (at your option) any later version. GutenMark is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with GutenMark; if not, write to the Free Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA Filename: AutoMark.h Purpose: Header used by AutoMark.c and friends. Mods: 08/30/01 RSB Began. 09/03/01 RSB Added footnote stuff. 11/02/01 RSB Added GPL disclaimer and reformatted somewhat for first web release. 11/09/01 RSB Added MarkInsertMdash and MarkInsertNdash. 11/12/01 RSB Added the YesHeader option. Added MarkGutenbergEnder. 11/13/01 RSB Added MarkBeginSmartQuote and MarkEndSmartQuote. 11/24/01 RSB Added NoForeign. 11/25/01 RSB Added Joe Cherry patch (JCPATCH). 12/09/01 RSB Added FirstItalics and FirstCapital. 12/10/01 RSB Added SingleSpace and ForceNumeric. 12/13/01 RSB Added WeirdSequences. 12/15/01 RSB Added various stuff that had been in MarkBody.c before it was split into MarkBody.c+MarkByChar.c. Added HTML_LENGTH. 12/30/01 RSB Added various functional prototypes associated with splitting MarkByLineHeuristic from MarkBody. also added ForceNeuralNet and NeuralLearn (not that they do anything yet). 01/01/02 RSB Added IsHeaderHeuristic. 01/13/02 RSB Added Xml field. Added OutputHtml, OutputLatex, OutputXml prototypes. 01/18/02 RSB Added NoPrefatory and PageBreaks. 
06/15/02 RSB Added NoParskip While there's no real upper bound on the number of POSSIBLE markups in a file, as a practical matter the number of markups is going to be approximately equal to the number of sentences (since each sentence has an   markup at the end) plus the number of paragraphs (since each has a leading

and trailing

markup). So for a very large 10M file, with an average sentence length of 50 characters, and an average paragraph length of 5 sentences, and with sizeof(MarkupRecord)=8, the maximum amount of markup data is about 2M. */ #ifndef INCLUDED_AUTOMARK_H #define INCLUDED_AUTOMARK_H #ifdef __BORLANDC__ #include #include #include #define strcasecmp strcmpi #define strncasecmp strncmpi #endif #include "libGutenSpell/libGutenSpell.h" //------------------------------------------------------------------------ // Constants. #define AUTOMARK_MAX_LINESIZE 1000 //#define SHORT_LINE 50 //#define REALLY_SHORT_LINE 35 // These constants are used for detecting the "prefatory" area. #define MAX_PREFATORY_LINES 1000 #define MIN_TEXT_LINES 20 // Number of lines prior to and following the current line to consider in // the analysis. #define PRE_OR_POST_LINES 10 #define BUFFERED_LINES (2*PRE_OR_POST_LINES+1) // Number of headings which, once matched from the prefatory area, // force all further matches to come from there (or from special words). #define PREFATORY_MATCH_CEILING 4 // Number of foreign words for a phrase we try to retain in memory. // Don't increase this, since 16 is already overkill. #define MAX_BACKTRACKS 16 // Length of an HTML line beyond which we'll try to wrap in a // paragraph. #define HTML_LENGTH 50 //----------------------------------------------------------------------- // Datatypes. // Within the body of the text, only the following markups will be made. 
enum MarkupType
{
  // Enumerator values are implicit and sequential, starting from 0.
  // NOTE(review): if these values are ever written to a file and read back,
  // new kinds should be appended rather than inserted -- confirm against
  // the code that reads and writes MarkupRecord.
  MarkNoMoreMarks,
  MarkRemoveChar,
  MarkNbsp,
  MarkBeginJustifiedParagraph,
  MarkBeginRaggedParagraph,
  MarkBeginCenteredParagraph,
  MarkEndParagraph,
  MarkBeginItalics,
  MarkEndItalics,
  MarkBeginBold,
  MarkEndBold,
  MarkBeginUnderline,
  MarkEndUnderline,
  MarkJumpPastGutenbergHeader,
  MarkEndOfGutenbergHeader,
  MarkBreak,
  MarkBlockquote,
  MarkEndBlockquote,
  MarkHeader1,
  MarkEndHeader1,
  MarkInsertChar,
  MarkBeginSubtitle,
  MarkEndSubtitle,
  MarkBeginTable,
  MarkEndTable,
  MarkInsertMdash,
  MarkInsertNdash,
  MarkGutenbergEnder,
  MarkBeginSmartQuote,
  MarkEndSmartQuote,
  MarkLsquo,
  MarkRsquo,
  MarkSoftHyphen,
  MarkToupper,
  MarkTolower
};

// This type of record is used to save info about a markup.
typedef struct
{
  unsigned long Offset;         // Where in the input file the markup occurs.
  enum MarkupType Type;         // Type of markup at that point.
  char Insert;                  // Only for MarkInsertChar;
} MarkupRecord;

// This type of record is used to save info about the first-pass
// analysis of all lines in the input file.  The info in this structure is
// geared towards heuristically locating paragraph breaks, headings, and/or
// verse.  Therefore, the definitions of some items may not be as expected.
typedef struct
{
  unsigned long Offset;         // Offset of the line in the input file.
  unsigned Empty:1;             // The line is just whitespace.
  unsigned BeginsWhite:1;       // The line begins with whitespace.
  unsigned BeginsQuote:1;       // The line begins with " or '
  unsigned CapFirstChar:1;      // The first character is capitalized.
  unsigned CapFirstWord:1;      // The first word in the line is capitalized.
  unsigned AllCaps:1;           // The line is all caps.
  unsigned VerseCap:1;          // Line begins with cap or quote-cap.
  unsigned BeginsBook:1;        // Begins with "Book" or "BOOK".
  unsigned BeginsChapter:1;     // Begins "Chapter", "Chap.", "CHAPTER", etc.
  unsigned BeginsRoman:1;       // Begins with a Roman number.
  unsigned SpecialRoman:1;      // The Roman number is "I", "X", or "C".
  unsigned WhiteAfterRoman:1;   // Roman number is followed by whitespace.
  unsigned OnlyRoman:1;         // The line is ONLY a roman number.
  unsigned EndPeriod:1;         // Ends with a period.
  unsigned OutOfRange:1;        // Dummy flag, fake record not in file.
  unsigned Contents:1;          // Begins "Contents" or "Table of Contents".
  unsigned Short:1;             // The line is shorter than expected.
  unsigned ReallyShort:1;       // ... and even more so!
  unsigned Scanned:1;           // The line is a "scanned by" line.
  unsigned RaggedStart:1;       // A line starting with ragged space.
  unsigned FormFeedPrior:1;     // Form feed in the space prior to the text.
  unsigned FormFeedAfter:1;     // NOTE(review): presumably a form feed in
                                // the space after the text -- confirm.
  unsigned PossibleTable:1;     // Set if might be tabular data.
  // Various odd sequences of characters may appear in things like line
  // drawings that are weird enough to short circuit normal justification,
  // and which therefore should cause the line to be preferentially treated
  // like preformatted text.  Some of these weirdities are:  5 or more
  // periods in a row, 4 or more dashes in a row, lines beginning or
  // ending with a vertical bar, etc.
  unsigned WeirdSequences:1;
  unsigned Leading:6;           // Actual leading space.  Being a 6-bit
                                // field, it can only hold values 0..63.
} LineRecord;

// I've added this in preparation for using footnotes, but the more I
// think about it the more difficult it becomes.  So, this structure is
// probably not used for anything ...
struct FootnoteRecord
{
  struct FootnoteRecord *Next;  // Pointer to next, NULL if at end of list.
  char *OriginalText;           // As it appeared in the body of the text,
                                // minus any brackets or braces.
  unsigned Pointer:1;           // 1 if it wasn't the full text of the note.
  unsigned Brackets:1;          // 1 if brackets enclosed the footnote.
  unsigned Braces:1;            // 1 if braces enclosed the footnote.
  unsigned PreString:1;         // 1 if a string preceded the note number.
  unsigned Pound:1;             // 1 if a pound sign preceded the note number.
  unsigned short Number;        // The original footnote number.
};

// This is the full set of data for analyzing the file.
typedef struct { FILE *InputFile; FILE *MarkupFile; FILE *LineFile; FILE *LogFile; // If the file has a header, like a Gutenberg header, this variable // contains the offset of the end of the header. unsigned long TextStart; unsigned long FileEnderRegion; // file length - 500. // This variable is used to distinguish between paragraph styles in which // blank lines separate paragraphs (0), and those in which the paragraphs // are indented (1). int IndentedParagraphs; // Total number of input lines not in the header. int NumLines; int LowestPrefatoryLine; int LowestNonPrefatoryLine; // Lines after PG header prior to actual text. unsigned ShortLineSize; unsigned ReallyShortLineSize; // Footnotes. struct FootnoteRecord *Footnotes; // Command-line options. int NoJustify; int NoMdash; int YesHeader; int NoForeign; int NoDiacritical; int Latex; // JCPATCH int FirstItalics; // 12/09/01 RSB int FirstCapital; // 12/09/01 RSB int SingleSpace; // 12/10/01 RSB int ForceNumeric; // 12/10/01 RSB int ForceNeural; // 12/30/01 RSB int NeuralLearn; // 12/30/01 RSB int Xml; // 01/13/02 RSB int NoPrefatory; // 01/18/02 RSB int PageBreaks; // 01/18/02 RSB int NoParskip; // 06/15/02 RSB // Here's where lines from the "prefatory" area are buffered. char *PrefatoryLines[MAX_PREFATORY_LINES]; int PrefatoryLineSizes[MAX_PREFATORY_LINES]; int NumNonTrivialPrefatoryLines; int NumPrefatoryMatched; // The wordlist for the file. Wordlist *Words; } AnalysisDataset; // Data held in common by MarkBody, MarkByLine, etc. typedef struct { int InParagraph, InBlockquote, InHeader1, InSubtitle, LastWasHeader1, ErrorCode, InTable, Versifying, LastVersifying, BracketItalicsCount; int FirstWordArea; LineRecord BufferedLines[BUFFERED_LINES], *LineInfo; int CouldBeName; int Italicizing, Centering, InWord; int WordItalicizing; // NOTE: ALWAYS 0 now. 
int TripSquote; int Foreignosity, LastForeignosity, InPreface; int SentenceStart; int BlockIndentation; enum MarkupType ParagraphType, Type; struct { long Start; // Starting offset in etext. long End; // Ending offset in etext. unsigned long Languages; } ForeignBacktracks[MAX_BACKTRACKS]; int NumBacktracks; long LastFirstSpace; } MarkStatus; //------------------------------------------------------------------------- // Function prototypes. int OutputHtml (FILE *, AnalysisDataset *); int OutputLatex (FILE *, AnalysisDataset *); int OutputXml (FILE *, AnalysisDataset *); int Markup (AnalysisDataset * Dataset); int CheckGutenbergHeader (AnalysisDataset * Dataset); int MarkBody (AnalysisDataset * Dataset); int LineAnalysisPass (AnalysisDataset * Dataset); int MatchSpecialWords (AnalysisDataset * Dataset, char *ss, LineRecord * Line); char *rfgets (char *s, int size, FILE * fp); int AddMarkup (AnalysisDataset * Dataset, long Offset, enum MarkupType Type, char Insert); int MarkByChar (AnalysisDataset * Dataset, MarkStatus * Status, char *s); int IsEndPunct (char c); int IsEndSpace (char c); int MatchesPrefatoryLine (AnalysisDataset * Dataset, char *Line); int IsHeader (LineRecord * BufferedLines, int *LastWasHeader1, int LineNum, AnalysisDataset * Dataset, char *Line); int IsHeaderHeuristic (LineRecord * BufferedLines, int *LastWasHeader1, int LineNum, AnalysisDataset * Dataset, char *Line); int IsHeaderNeural (LineRecord * BufferedLines, int *LastWasHeader1, int LineNum, AnalysisDataset * Dataset, char *Line); int MarkByLineHeuristic (AnalysisDataset * Dataset, MarkStatus * Status, int LineNum, char *s); int MarkByLineNeural (AnalysisDataset * Dataset, MarkStatus * Status, int LineNum, char *s); void PrefatoryAnalysisPassHeuristic (AnalysisDataset * Dataset); void PrefatoryAnalysisPassNeural (AnalysisDataset * Dataset); int NormalizePotentialHeader (char *s); void NormalizeTitle (char *s, int size); #endif // INCLUDED_AUTOMARK_H