///////////////////////////////////////////////////////////////////////////// /* Copyright 2001 Ronald S. Burkey This file is part of GutenMark. GutenMark is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation; either version 2 of the License, or (at your option) any later version. GutenMark is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with GutenMark; if not, write to the Free Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA Filename: MarkBody.c Purpose: This analyzes the body of the text. Mods: 08/31/01 RSB Began. 09/03/01 RSB N, S, E, and W added to honorifics (really street names, but works the same). 09/04/01 RSB Now replace '<' & '>' & '&', since they can be mistaken for markup. Add closing markup on reaching end of file. 09/06/01 RSB Process paragraphs to try and deduce whether to make them justified or ragged. 09/08/01 RSB Lotsa cleanup and tweaking. 09/15/01 RSB Now account for the ~~ italics delimiters. Also, the /italics/ format. Added more heuristics for detecting verse. 11/02/01 RSB Added GPL disclaimer and reformatted somewhat for first web release. 11/09/01 RSB Now convert all strings of dashes of length 2 or more to mdashes and ndashes. In other words, "--" converts to — "---" converts to —– "----" converts to —— and so on. These don't look good in html, but look great in postscript or pdf. Also, the dash in " - " is converted to an mdash. 11/12/01 RSB Added detection of the PG file-ender. 11/13/01 RSB Added smart quotes. 11/23/01 RSB Added ALL-CAPs and foreign italicizing. 11/24/01 RSB Centralized all of the markup-file output operations into the AddMarkup function. 8-bit ASCII restoration. 11/25/01 RSB Resolved conflicts between 8-bit restoration and ALL-CAPS/foreign markup. 12/09/01 RSB Fixed bug in which an ALL-CAPS word at the beginning of a section (or other odd circumstance) might be treated as being in the middle of a sentence. Began adding support for --first-italics and --first-capital. 12/10/01 RSB Modified Nbsp markup after colon or between sentences with SingleSpace. Added some HTML beautification: The stuff involving Status.LastFirstSpace is all used to put closing tags for things like paragraphs, headings, etc. BEFORE newlines rather than after them. 12/11/01 RSB Fixed that hanging

that was always being stuck at the file end due to the unused PG "end" line. When backtracking through foreign words, it was possible to double- italicize words that had already been italicized due to being ALL-CAPS. The ALL-CAPS condition now terminates the backtrack list. 12/13/01 RSB Added WeirdSequences, LikelyName, and WantFirstCharUpper. 12/15/01 RSB Split off MarkByChar from MarkBody to make the code look a little simpler. 12/16/01 RSB Added additional code to the centering recognizer to distinguish block quotes whose first and/or last lines are simply indented differently. Also, added code to the blockquote processor for this case. 12/27/01 RSB Fixed a bug in which the markup for the PG ender would be at offset 0 if there was no header. 12/30/01 RSB Split MarkByLineHeuristic from the MarkBody function, to make the heuristic approach more easily replaceable by a neural-net approach. 01/01/02 RSB Moved AddMarkup function to its own source file. Split IsHeader into IsHeaderNeural and IsHeaderHeuristic, each in its own soard file. Similarly, split PrefatoryAnalysisPass function. */ #include #include #include #include #include "AutoMark.h" //----------------------------------------------------------------------- // Normalizes a potential heading string in-place. Returns the length of // the normalized string. int NormalizePotentialHeader (char *s) { char *ss; int j; // Normalize the line by removing leading spaces. for (ss = s; isspace (*ss); ss++); if (ss != s) strcpy (s, ss); // Also, remove any trailing spaces or punctuation. j = strlen (s); if (j > 0) { for (ss = &s[j - 1]; ss >= s; ss--) { if (isspace (*ss) || ispunct (*ss)) *ss = 0; else break; } } // Get rid of multiple spaces. for (ss = s; *ss; ss++) if (isspace (*ss)) { *ss = ' '; if (isspace (ss[1])) { strcpy (ss, ss + 1); ss--; } } // Oh, and let's make it upper-case. for (ss = s; *ss; ss++) *ss = toupper (*ss); j = strlen (s); return (j); } //--------------------------------------------------------------------------- // Checks a line to see if it's a match against the buffered prefatory lines. int MatchesPrefatoryLine (AnalysisDataset * Dataset, char *Line) { char s[256]; int i; if (NULL != Line) { strcpy (s, Line); NormalizePotentialHeader (s); for (i = 0; i < Dataset->NumNonTrivialPrefatoryLines; i++) if (0 == strncmp (s, Dataset->PrefatoryLines[i], Dataset->PrefatoryLineSizes[i]) && strlen (s) < 2 * Dataset->PrefatoryLineSizes[i]) return (1); } return (0); } //------------------------------------------------------------------------ // This function is used to recognize headings. Returns 0 if it thinks the // line is not a header. The array Status.BufferedLines[BUFFERED_LINES] should have // been pre-loaded. Status.BufferedLines[PRE_OR_POST_LINES] is the current line, // while the other array elements are earlier and later lines from the file. int IsHeader (LineRecord * BufferedLines, int *LastWasHeader1, int LineNum, AnalysisDataset * Dataset, char *Line) { int RetVal; if (Dataset->ForceNeural) RetVal = IsHeaderNeural (BufferedLines, LastWasHeader1, LineNum, Dataset, Line); else RetVal = IsHeaderHeuristic (BufferedLines, LastWasHeader1, LineNum, Dataset, Line); return (RetVal); } //--------------------------------------------------------------------- // Tests if a character is appropriate for end-of-phrase punctuation. int IsEndPunct (char c) { return (c == '.' || c == '?' || c == '!' || c == ':' || c == ';' || c == ')' || c == ',' || c == '}' || c == ']'); } int IsEndSpace (char c) { return (!c || isspace (c)); } //--------------------------------------------------------------------- // The method of analysis uses several passes. The first pass collects // data about all the lines of the file. The last pass advances through // the file a line at a time, keeping in memory a window of the // pre-collected line data in which several lines prior to the curent line // and several lines after the current line are available for quick access. // This might allow, for example, detecting a line preceded and followed // by a couple of blank lines as being a header. int MarkBody (AnalysisDataset * Dataset) { MarkStatus Status = { 0 }; char s[256]; int ReturnValue = 1; LineRecord DefaultLine = { 0 }; int i, j; Status.BlockIndentation = 0; Status.TripSquote = -1; Status.SentenceStart = 1; Status.ParagraphType = MarkBeginJustifiedParagraph; Dataset->LineFile = tmpfile (); if (Dataset->LineFile == NULL) { fprintf (stderr, "Cannot create first-pass temporary file.\n"); ReturnValue = 2; goto Done; } //--------------------------------------------------------------------- // Run the first analysis pass. if (0 != (Status.ErrorCode = LineAnalysisPass (Dataset))) { fprintf (stderr, "Line-analysis failed (code = 0x%X).\n", Status.ErrorCode); ReturnValue = 3; goto Done; } //---------------------------------------------------------------------- // Another analysis pass that attempts to determine where the title-page & // contents are. In other words, the area between the PG header and the // actual text. if (Dataset->ForceNeural) PrefatoryAnalysisPassNeural (Dataset); else PrefatoryAnalysisPassHeuristic (Dataset); Dataset->NumPrefatoryMatched = 0; //---------------------------------------------------------------------- // Run the Final analysis pass. // Set up the data structures. s[sizeof (s) - 1] = '\0'; fseek (Dataset->LineFile, 0, SEEK_SET); fseek (Dataset->InputFile, Dataset->TextStart, SEEK_SET); DefaultLine.OutOfRange = 1; DefaultLine.Empty = 1; for (i = 0; i < PRE_OR_POST_LINES; i++) Status.BufferedLines[i] = DefaultLine; for (; i < BUFFERED_LINES; i++) { if (1 != fread (Status.BufferedLines + i, sizeof (LineRecord), 1, Dataset->LineFile)) Status.BufferedLines[i] = DefaultLine; } Status.InParagraph = Status.InBlockquote = Status.InHeader1 = Status.LastWasHeader1 = Status.InSubtitle = Status.InTable = Status.Versifying = Status.LastVersifying = 0; Status.LineInfo = &Status.BufferedLines[PRE_OR_POST_LINES]; // Okay, here's where we actually loop through the lines of the input file. for (i = 0; i < Dataset->NumLines; i++) { // Fetch the current line of the input file. The window of buffered // LineRecord data is already set up properly. if (NULL == rfgets (s, sizeof (s) - 1, Dataset->InputFile)) { fprintf (stderr, "Premature end of input file.\n"); ReturnValue = 4; goto Done; } // Now do stuff related to the entire line: if (Dataset->ForceNeural) j = MarkByLineNeural (Dataset, &Status, i, s); else j = MarkByLineHeuristic (Dataset, &Status, i, s); switch (j) { case 5: goto DiskError; case -1: goto AtEnd; case 0: break; default: return (-1); // Unknown error type. } // Now do character-by-character and word-by-word stuff. if (MarkByChar (Dataset, &Status, s)) goto DiskError; // *** All done with this line! *** //DoneThisLine: // Advance the LineRecord window. Note, if it isn't obvious, that // it's okay for the window to move past the end of file, since // it is filled with default blank lines in this case. for (j = 0; j < BUFFERED_LINES - 1; j++) Status.BufferedLines[j] = Status.BufferedLines[j + 1]; if (1 != fread (Status.BufferedLines + j, sizeof (LineRecord), 1, Dataset->LineFile)) Status.BufferedLines[j] = DefaultLine; } AtEnd: // We're at the end, but may still be in a heading, paragraph, etc. if (Status.Italicizing) { if (AddMarkup (Dataset, ftell (Dataset->InputFile), MarkEndItalics, 0)) goto DiskError; } if (Status.InSubtitle) { if (AddMarkup (Dataset, ftell (Dataset->InputFile), MarkEndSubtitle, 0)) goto DiskError; } if (Status.InTable) { if (AddMarkup (Dataset, ftell (Dataset->InputFile), MarkEndTable, 0)) goto DiskError; } if (Status.InParagraph) { if (AddMarkup (Dataset, ftell (Dataset->InputFile), MarkEndParagraph, 0)) goto DiskError; } if (Status.InBlockquote) { if (AddMarkup (Dataset, ftell (Dataset->InputFile), MarkEndBlockquote, 0)) goto DiskError; } if (Status.InHeader1) { if (AddMarkup (Dataset, ftell (Dataset->InputFile), MarkEndHeader1, 0)) goto DiskError; } if (Status.LineInfo[0].Offset != 0) if (AddMarkup (Dataset, Status.LineInfo[0].Offset, MarkGutenbergEnder, 0)) goto DiskError; ReturnValue = 0; Done:fclose (Dataset->LineFile); AddMarkup (Dataset, -1, 0, 0); // Flush the buffer. return (ReturnValue); DiskError:fprintf (stderr, "Disk-write error.\n"); ReturnValue = 5; goto Done; }