/////////////////////////////////////////////////////////////////////////////
/*
Copyright 2001 Ronald S. Burkey
This file is part of GutenMark.
GutenMark is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
GutenMark is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with GutenMark; if not, write to the Free Software
Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
Filename: MarkBody.c
Purpose: This analyzes the body of the text.
Mods: 08/31/01 RSB Began.
09/03/01 RSB N, S, E, and W added to honorifics (really
street names, but works the same).
09/04/01 RSB Now replace '<' & '>' & '&', since they
can be mistaken for markup. Add closing
markup on reaching end of file.
09/06/01 RSB Process paragraphs to try and deduce
whether to make them justified or ragged.
09/08/01 RSB Lotsa cleanup and tweaking.
09/15/01 RSB Now account for the ~~ italics delimiters.
Also, the /italics/ format.
Added more heuristics for detecting verse.
11/02/01 RSB Added GPL disclaimer and reformatted
somewhat for first web release.
11/09/01 RSB Now convert all strings of dashes of
length 2 or more to mdashes and ndashes.
In other words, "--" converts to —
"---" converts to —–
"----" converts to —— and so on.
These don't look good in html, but look
great in postscript or pdf. Also, the
dash in " - " is converted to an mdash.
11/12/01 RSB Added detection of the PG file-ender.
11/13/01 RSB Added smart quotes.
11/23/01 RSB Added ALL-CAPs and foreign italicizing.
11/24/01 RSB Centralized all of the markup-file output
operations into the AddMarkup function.
8-bit ASCII restoration.
11/25/01 RSB Resolved conflicts between 8-bit restoration
and ALL-CAPS/foreign markup.
12/09/01 RSB Fixed bug in which an ALL-CAPS word
at the beginning of a section (or other
odd circumstance) might be treated as being
in the middle of a sentence. Began adding
support for --first-italics and
--first-capital.
12/10/01 RSB Modified Nbsp markup after colon or between
sentences with SingleSpace. Added some
HTML beautification: The stuff involving
Status.LastFirstSpace is all used to put closing
tags for things like paragraphs, headings,
etc. BEFORE newlines rather than after them.
12/11/01 RSB Fixed the hanging markup
that was always being
stuck at the file end due to the unused
PG "end" line. When backtracking through
foreign words, it was possible to double-
italicize words that had already been
italicized due to being ALL-CAPS. The
ALL-CAPS condition now terminates the
backtrack list.
12/13/01 RSB Added WeirdSequences, LikelyName, and
WantFirstCharUpper.
12/15/01 RSB Split off MarkByChar from MarkBody to make
the code look a little simpler.
12/16/01 RSB Added additional code to the centering
recognizer to distinguish block quotes
whose first and/or last lines are simply
indented differently. Also, added code
to the blockquote processor for this case.
12/27/01 RSB Fixed a bug in which the markup for the
PG ender would be at offset 0 if there was
no header.
12/30/01 RSB Split MarkByLineHeuristic from the
MarkBody function, to make the heuristic
approach more easily replaceable by a
neural-net approach.
01/01/02 RSB Moved AddMarkup function to its own
source file. Split IsHeader into
IsHeaderNeural and IsHeaderHeuristic, each in
its own source file. Similarly, split
PrefatoryAnalysisPass function.
*/
#include <ctype.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include "AutoMark.h"
//-----------------------------------------------------------------------
// Normalizes a potential heading string in-place.  Returns the length of
// the normalized string.
//
// The normalization consists of:
//   1. removing leading whitespace;
//   2. removing trailing whitespace and punctuation;
//   3. collapsing every run of whitespace into a single blank;
//   4. converting the text to upper case.
//
// The string only ever shrinks, so the caller's buffer is always large
// enough.  Characters are converted to unsigned char before being passed
// to the <ctype.h> classifiers, because passing a negative char (any
// 8-bit character on platforms where char is signed) is undefined.
int
NormalizePotentialHeader (char *s)
{
  char *In, *Out;
  size_t Length;

  // Remove leading whitespace.  The source and destination overlap, so
  // memmove() must be used here rather than strcpy().
  for (In = s; isspace ((unsigned char) *In); In++);
  if (In != s)
    memmove (s, In, strlen (In) + 1);

  // Remove trailing whitespace and punctuation.  Working with an index
  // avoids ever forming a pointer that precedes the buffer.
  Length = strlen (s);
  while (Length > 0
	 && (isspace ((unsigned char) s[Length - 1])
	     || ispunct ((unsigned char) s[Length - 1])))
    s[--Length] = '\0';

  // Collapse whitespace runs to a single blank and upper-case the rest,
  // compacting the string in one pass.  Out never gets ahead of In.
  Out = s;
  for (In = s; *In; In++)
    {
      if (isspace ((unsigned char) *In))
	{
	  // Skip this character if a blank has already been emitted for
	  // the current run of whitespace.
	  if (Out > s && Out[-1] == ' ')
	    continue;
	  *Out++ = ' ';
	}
      else
	*Out++ = (char) toupper ((unsigned char) *In);
    }
  *Out = '\0';

  return ((int) (Out - s));
}
//---------------------------------------------------------------------------
// Checks a line to see if it's a match against the buffered prefatory lines.
//
// The line is copied into a local buffer, normalized with
// NormalizePotentialHeader(), and then compared against each of the
// Dataset->NumNonTrivialPrefatoryLines entries of Dataset->PrefatoryLines.
// An entry matches when its first PrefatoryLineSizes[i] characters equal
// the start of the normalized line AND the normalized line is less than
// twice that long.
//
// Returns 1 on a match, 0 otherwise (including when Line is NULL).
//
// The copy is bounded by the size of the local buffer:  an over-long Line
// is truncated rather than being allowed to overrun the stack buffer.
int
MatchesPrefatoryLine (AnalysisDataset * Dataset, char *Line)
{
  char s[256];
  size_t Length;
  int i;

  if (NULL == Line)
    return (0);

  // Bounded copy; always leaves s NUL-terminated.
  Length = strlen (Line);
  if (Length >= sizeof (s))
    Length = sizeof (s) - 1;
  memcpy (s, Line, Length);
  s[Length] = '\0';

  NormalizePotentialHeader (s);
  for (i = 0; i < Dataset->NumNonTrivialPrefatoryLines; i++)
    if (0 ==
	strncmp (s, Dataset->PrefatoryLines[i],
		 Dataset->PrefatoryLineSizes[i]) && strlen (s) <
	2 * Dataset->PrefatoryLineSizes[i])
      return (1);
  return (0);
}
//------------------------------------------------------------------------
// Heading recognizer.  This is only a dispatcher:  when
// Dataset->ForceNeural is set the decision is delegated to
// IsHeaderNeural(), otherwise to IsHeaderHeuristic().  All arguments are
// passed through untouched and the delegate's result is returned as-is.
// A return of 0 means the line is not thought to be a header.
// BufferedLines is expected to be the pre-loaded window of line records
// surrounding the current line (the current line being the entry at
// index PRE_OR_POST_LINES, with earlier and later lines around it).
int
IsHeader (LineRecord * BufferedLines, int *LastWasHeader1, int LineNum,
	  AnalysisDataset * Dataset, char *Line)
{
  if (Dataset->ForceNeural)
    return (IsHeaderNeural
	    (BufferedLines, LastWasHeader1, LineNum, Dataset, Line));

  return (IsHeaderHeuristic
	  (BufferedLines, LastWasHeader1, LineNum, Dataset, Line));
}
//---------------------------------------------------------------------
// Tests if a character is appropriate for end-of-phrase punctuation.
// Returns 1 for any of  . ? ! : ; ) , } ]  and 0 for everything else.
int
IsEndPunct (char c)
{
  switch (c)
    {
    case '.':
    case '?':
    case '!':
    case ':':
    case ';':
    case ')':
    case ',':
    case '}':
    case ']':
      return (1);
    default:
      return (0);
    }
}
// Tests if a character can end a word or phrase:  returns 1 for the
// string terminator (NUL) or any whitespace character, and 0 otherwise.
// The character is converted to unsigned char before classification,
// because handing a negative char (an 8-bit character where char is
// signed) to isspace() is undefined.
int
IsEndSpace (char c)
{
  return (c == '\0' || isspace ((unsigned char) c));
}
//---------------------------------------------------------------------
// The method of analysis uses several passes.  The first pass collects
// data about all the lines of the file.  The last pass advances through
// the file a line at a time, keeping in memory a window of the
// pre-collected line data in which several lines prior to the current line
// and several lines after the current line are available for quick access.
// This might allow, for example, detecting a line preceded and followed
// by a couple of blank lines as being a header.
//
// Return value:
//    0   success
//    2   the temporary line-record file could not be created
//    3   LineAnalysisPass() reported an error
//    4   the input file ended before Dataset->NumLines lines were read
//    5   AddMarkup() or MarkByChar() reported a failure, or the per-line
//        marker returned 5 (reported as "Disk-write error.")
//   -1   the per-line marker returned a code not recognized here
//
// Every exit path goes through the cleanup code at "Done", which closes
// the temporary file (if it was opened) and flushes the markup buffer.
int
MarkBody (AnalysisDataset * Dataset)
{
  MarkStatus Status = { 0 };
  char s[256];
  int ReturnValue = 1;
  LineRecord DefaultLine = {
    0
  };
  int i, j;

  Status.BlockIndentation = 0;
  Status.TripSquote = -1;
  Status.SentenceStart = 1;
  Status.ParagraphType = MarkBeginJustifiedParagraph;
  Dataset->LineFile = tmpfile ();
  if (Dataset->LineFile == NULL)
    {
      fprintf (stderr, "Cannot create first-pass temporary file.\n");
      ReturnValue = 2;
      goto Done;
    }

  //---------------------------------------------------------------------
  // Run the first analysis pass.
  if (0 != (Status.ErrorCode = LineAnalysisPass (Dataset)))
    {
      fprintf (stderr, "Line-analysis failed (code = 0x%X).\n",
	       Status.ErrorCode);
      ReturnValue = 3;
      goto Done;
    }

  //----------------------------------------------------------------------
  // Another analysis pass that attempts to determine where the title-page &
  // contents are.  In other words, the area between the PG header and the
  // actual text.
  if (Dataset->ForceNeural)
    PrefatoryAnalysisPassNeural (Dataset);
  else
    PrefatoryAnalysisPassHeuristic (Dataset);
  Dataset->NumPrefatoryMatched = 0;

  //----------------------------------------------------------------------
  // Run the Final analysis pass.
  // Set up the data structures.
  s[sizeof (s) - 1] = '\0';
  fseek (Dataset->LineFile, 0, SEEK_SET);
  fseek (Dataset->InputFile, Dataset->TextStart, SEEK_SET);
  DefaultLine.OutOfRange = 1;
  DefaultLine.Empty = 1;
  // The slots ahead of the current line start out as out-of-range blank
  // records, since nothing precedes the first line ...
  for (i = 0; i < PRE_OR_POST_LINES; i++)
    Status.BufferedLines[i] = DefaultLine;
  // ... and the remaining slots (current line onward) are loaded from the
  // line-record file, falling back to the blank record if it runs short.
  for (; i < BUFFERED_LINES; i++)
    {
      if (1 !=
	  fread (Status.BufferedLines + i, sizeof (LineRecord), 1,
		 Dataset->LineFile))
	Status.BufferedLines[i] = DefaultLine;
    }
  Status.InParagraph = Status.InBlockquote = Status.InHeader1 =
    Status.LastWasHeader1 = Status.InSubtitle = Status.InTable =
    Status.Versifying = Status.LastVersifying = 0;
  Status.LineInfo = &Status.BufferedLines[PRE_OR_POST_LINES];

  // Okay, here's where we actually loop through the lines of the input file.
  for (i = 0; i < Dataset->NumLines; i++)
    {
      // Fetch the current line of the input file.  The window of buffered
      // LineRecord data is already set up properly.
      if (NULL == rfgets (s, sizeof (s) - 1, Dataset->InputFile))
	{
	  fprintf (stderr, "Premature end of input file.\n");
	  ReturnValue = 4;
	  goto Done;
	}

      // Now do stuff related to the entire line:
      if (Dataset->ForceNeural)
	j = MarkByLineNeural (Dataset, &Status, i, s);
      else
	j = MarkByLineHeuristic (Dataset, &Status, i, s);
      switch (j)
	{
	case 5:
	  goto DiskError;
	case -1:
	  // Stop processing lines and close whatever markup is open.
	  goto AtEnd;
	case 0:
	  break;
	default:
	  // Unknown error type.  Leave through the common cleanup so that
	  // the temporary file is closed and the markup buffer is flushed.
	  ReturnValue = -1;
	  goto Done;
	}

      // Now do character-by-character and word-by-word stuff.
      if (MarkByChar (Dataset, &Status, s))
	goto DiskError;

      // *** All done with this line! ***
      // Advance the LineRecord window.  Note, if it isn't obvious, that
      // it's okay for the window to move past the end of file, since
      // it is filled with default blank lines in this case.
      for (j = 0; j < BUFFERED_LINES - 1; j++)
	Status.BufferedLines[j] = Status.BufferedLines[j + 1];
      if (1 !=
	  fread (Status.BufferedLines + j, sizeof (LineRecord), 1,
		 Dataset->LineFile))
	Status.BufferedLines[j] = DefaultLine;
    }

AtEnd:
  // We're at the end, but may still be in a heading, paragraph, etc.
  // Emit the closing markup for each construct that is still open, at
  // the current position of the input file.
  if (Status.Italicizing)
    {
      if (AddMarkup (Dataset, ftell (Dataset->InputFile), MarkEndItalics, 0))
	goto DiskError;
    }
  if (Status.InSubtitle)
    {
      if (AddMarkup (Dataset, ftell (Dataset->InputFile), MarkEndSubtitle, 0))
	goto DiskError;
    }
  if (Status.InTable)
    {
      if (AddMarkup (Dataset, ftell (Dataset->InputFile), MarkEndTable, 0))
	goto DiskError;
    }
  if (Status.InParagraph)
    {
      if (AddMarkup
	  (Dataset, ftell (Dataset->InputFile), MarkEndParagraph, 0))
	goto DiskError;
    }
  if (Status.InBlockquote)
    {
      if (AddMarkup
	  (Dataset, ftell (Dataset->InputFile), MarkEndBlockquote, 0))
	goto DiskError;
    }
  if (Status.InHeader1)
    {
      if (AddMarkup (Dataset, ftell (Dataset->InputFile), MarkEndHeader1, 0))
	goto DiskError;
    }
  // Mark the Gutenberg ender at the current line record's offset; an
  // offset of 0 is skipped so that no ender markup lands at offset 0.
  if (Status.LineInfo[0].Offset != 0)
    if (AddMarkup (Dataset, Status.LineInfo[0].Offset, MarkGutenbergEnder, 0))
      goto DiskError;
  ReturnValue = 0;

Done:
  // LineFile is NULL when tmpfile() failed; fclose(NULL) is undefined.
  if (Dataset->LineFile != NULL)
    fclose (Dataset->LineFile);
  AddMarkup (Dataset, -1, 0, 0);	// Flush the buffer.
  return (ReturnValue);

DiskError:
  fprintf (stderr, "Disk-write error.\n");
  ReturnValue = 5;
  goto Done;
}