///////////////////////////////////////////////////////////////////////////// /* Copyright 2001 Ronald S. Burkey This file is part of GutenMark. GutenMark is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation; either version 2 of the License, or (at your option) any later version. GutenMark is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with GutenMark; if not, write to the Free Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA Filename: PrefatoryAnalysisPassHeuristic.c Purpose: Identifies the document's "prefatory area" using a heuristic method. An alternate drop-in replacement using a neural-net method may be selected instead at runtime. Mods: 01/01/02 RSB Moved and renamed from MarkBody.c. 06/15/02 RSB Corrected a shocking indexing error. How could this ever have worked? */ #include #include #include #include #include "AutoMark.h" //--------------------------------------------------------------------------- // Attempt to find the "prefatory" area, which is what I call the area between // the PG header and the actual text. The net result of the pass is simply to // set Dataset->NumPrefatoryLines. The basic technique is to find, within the // first MAX_PREFATORY_LINES the first candidate heading followed by at least // MIN_TEXT_LINES of non-headings. Obviously, some additional flourishes may // be required. For example, if we find a non-blank line that's a duplicate // of a prior line, it is probably the first section. // // The reason I'm doing this at all is that if the Gutenberg text contains a // "table of contents", it really messes things up in terms of extraneous // headings. 
So if I can find something labeled "contents" or "table of // contents", I'll include it even if it's outside of these parameters. void PrefatoryAnalysisPassHeuristic (AnalysisDataset * Dataset) { int i, j, k, TextLines, LastWasHeader1; LineRecord BufferedLines[BUFFERED_LINES], DefaultLine = { 0 }; char s[256], *ss; Dataset->LowestPrefatoryLine = 0; Dataset->LowestNonPrefatoryLine = 0; // See if we can find something labeled as the table of contents. // Everything in front of that will be included in the prefatory // area by default (but won't be buffered). s[sizeof (s) - 1] = 0; fseek (Dataset->InputFile, Dataset->TextStart, SEEK_SET); for (i = 0; i < MAX_PREFATORY_LINES; i++) if (NULL != rfgets (s, sizeof (s) - 1, Dataset->InputFile)) { NormalizePotentialHeader (s); if (0 == strcmp (s, "CONTENTS") || 0 == strcmp (s, "TABLE OF CONTENTS") || 0 == strncmp (s, "CONTENTS OF", 11) || 0 == strncmp (s, "TABLE OF CONTENTS OF", 20)) { Dataset->LowestPrefatoryLine = Dataset->LowestNonPrefatoryLine = i; break; } } else break; // Load up the Status.BufferedLines array. DefaultLine.OutOfRange = 1; DefaultLine.Empty = 1; fseek (Dataset->LineFile, 0, SEEK_SET); for (i = 0; i < PRE_OR_POST_LINES; i++) BufferedLines[i] = DefaultLine; for (; i < BUFFERED_LINES; i++) { if (1 != fread (BufferedLines + i, sizeof (LineRecord), 1, Dataset->LineFile)) BufferedLines[i] = DefaultLine; } // Now do the actual search. TextLines = -MAX_PREFATORY_LINES; for (i = 0; i < MAX_PREFATORY_LINES; i++) { if (i >= Dataset->LowestPrefatoryLine) { LastWasHeader1 = 0; if (!BufferedLines[PRE_OR_POST_LINES].Contents && IsHeader (BufferedLines, &LastWasHeader1, i, Dataset, NULL)) { Dataset->LowestNonPrefatoryLine = i; TextLines = 0; } else TextLines++; if (TextLines > MIN_TEXT_LINES) break; } // Load the next line. 
for (j = 0; j < BUFFERED_LINES - 1; j++) BufferedLines[j] = BufferedLines[j + 1]; if (1 != fread (BufferedLines + BUFFERED_LINES - 1, sizeof (LineRecord), 1, Dataset->LineFile)) BufferedLines[j /* 06/15/02 RSB. Shockingly, was i! */] = DefaultLine; } if (TextLines <= MIN_TEXT_LINES) Dataset->LowestNonPrefatoryLine = Dataset->LowestPrefatoryLine; fseek (Dataset->LineFile, 0, SEEK_SET); // Now that the prefatory area has been located, buffer the actual lines // from it. But don't allow duplicates. If duplicates are found, // then shorten up the area. fseek (Dataset->InputFile, Dataset->TextStart, SEEK_SET); s[sizeof (s) - 1] = 0; Dataset->NumNonTrivialPrefatoryLines = 0; for (i = 0; i < Dataset->LowestNonPrefatoryLine; i++) { rfgets (s, sizeof (s) - 1, Dataset->InputFile); if (i < Dataset->LowestPrefatoryLine) continue; j = NormalizePotentialHeader (s); if (j > 5) { // Here we have an a non-trivial line. Let's check to see if we've // already buffered it. for (k = 0; k < Dataset->NumNonTrivialPrefatoryLines; k++) if (0 == strcmp (s, Dataset->PrefatoryLines[k])) { // It's a duplicate! Dataset->LowestNonPrefatoryLine = i; goto DoneBuffering; } // Haven't buffered it yet, so it's okay to continue. ss = (char *) calloc (1, j + 1); if (ss != NULL) { Dataset->PrefatoryLines[Dataset->NumNonTrivialPrefatoryLines] = ss; strcpy (ss, s); Dataset->PrefatoryLineSizes[Dataset-> NumNonTrivialPrefatoryLines] = j; (Dataset->NumNonTrivialPrefatoryLines)++; } } } DoneBuffering:fseek (Dataset->InputFile, Dataset->TextStart, SEEK_SET); }