///////////////////////////////////////////////////////////////////////////// /* Copyright 2001 Ronald S. Burkey This file is part of GutenMark. GutenMark is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation; either version 2 of the License, or (at your option) any later version. GutenMark is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with GutenMark; if not, write to the Free Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA Filename: MarkByLineHeuristic.c Purpose: This handles just things high-level aspects of the line-level analysis, using data provided from LineAnalysisPass. In other words, it does things like detect headings, paragraph starts, paragraph ends, verse, etc., and supplies the appropriate markup. Mods: 12/30/01 RSB Split off from MarkBody.c, where it was formerly a large chunk of the MarkBody function. This function is (functionally) equivalent to the MarkByLineNeural function. It embodies the original approach I used in GutenMark. However, while it doesn't do a BAD job, in many ways, I soon came to realize that a different approach based on neural nets might do a better job. Therefore, there are two functionally equivalent functions, though hopefully not identical in performance, for the same job. Which one is actually used depends on command-line switches. */ #include #include #include #include #include "AutoMark.h" //---------------------------------------------------------------------- // After a certain number of these headings match the ones found in the // prefatory area, we no longer allow any new headings that DON'T match // unless they are pretty convincing in some other way. static int NowMatching (AnalysisDataset * Dataset, char *s, LineRecord * Line) { int InHeader1 = 1, j; j = MatchesPrefatoryLine (Dataset, s); if (Dataset->NumPrefatoryMatched > PREFATORY_MATCH_CEILING) { if (!j && !MatchSpecialWords (Dataset, s, Line) && !Line->AllCaps && !Line->ReallyShort) InHeader1 = 0; } else { if (j) (Dataset->NumPrefatoryMatched)++; } return (InHeader1); } //-------------------------------------------------------------------- // Detect a separating line, such as a line of all '*' or '-'. static int IsSeparatorLine (char *s, LineRecord * NextLine) { char *ss; if (NextLine->Empty) { // First, check for all stars. for (ss = s; *ss; ss++) if (!isspace (*ss) && *ss != '*') break; if (*ss == 0) return (1); // Now, check for all dashes. for (ss = s; *ss; ss++) if (!isspace (*ss) && *ss != '-') break; if (*ss == 0) return (1); // ... and underlines. for (ss = s; *ss; ss++) if (!isspace (*ss) && *ss != '_') break; if (*ss == 0) return (1); // .. and dots. . for (ss = s; *ss; ss++) if (!isspace (*ss) && *ss != '.') break; if (*ss == 0) return (1); } return (0); } //--------------------------------------------------------------------- // As complex as this function is, at base it's trying to do something // pretty simple: Just to locate headings, starts and ends of paragraphs, // versified areas, etc. // Returns: // 0 Success // 5 Disk error // -1 At end of file int MarkByLineHeuristic (AnalysisDataset * Dataset, MarkStatus * Status, int LineNum, char *s) { char *ss; int j, k, n; Status->FirstWordArea = Status->LastWasHeader1; // 12/09/01 RSB. Status->InPreface = (LineNum < Dataset->LowestNonPrefatoryLine); if (Status->LineInfo[0].Empty) // 12/09/01 RSB Status->SentenceStart = 1; // 12/09/01 RSB // Analyze. In case it isn't obvious, Status->LineInfo[0] contains the // LineRecord for the current line. Status->LineInfo[-1], ..., // Status->LineInfo[-PRE_OR_POST_LINES] are for the prior lines. // Status->LineInfo[1], ..., Status->LineInfo[PRE_OR_POST_LINES] // are for the succeeding lines. // Locate PG file-ender. We define this as any line within // 500 characters of the file-end that contains the words // "end", and "project gutenberg" or "PG". if (ftell (Dataset->InputFile) > Dataset->FileEnderRegion) { int End = 0, PG = 0; for (ss = s; *ss; ss++) { if (!strncasecmp (ss, "end ", 4)) { End = 1; if (PG) break; } else if (!strncasecmp (ss, " PG ", 4) || !strncasecmp (ss, "project gutenberg", 17)) { PG = 1; if (End) break; } } if (End && PG) goto AtEnd; } // *** Do stuff pertaining to the line properties, such as // paragraph begin/end, headers, and verse *** if (Status->InBlockquote) { if (Status->LineInfo[0].Empty || !Status->LineInfo[0].BeginsWhite) { Status->InBlockquote = 0; Status->BlockIndentation = 0; if (AddMarkup (Dataset, Status->LastFirstSpace /* Status->LineInfo[0].Offset */ , MarkEndBlockquote, 0)) goto DiskError; } // 12/16/01 RSB. The following conditionals handle the first or // last lines of block quotes, if indented differently. else if (!Status->LineInfo[-1].Empty && Status->LineInfo[-1].Leading > Status->BlockIndentation) { if (AddMarkup (Dataset, Status->LastFirstSpace, MarkBreak, 0)) goto DiskError; } else if (Status->LineInfo[0].Leading > Status->BlockIndentation && !Status->LineInfo[-1].Empty) { if (AddMarkup (Dataset, Status->LastFirstSpace, MarkBreak, 0)) goto DiskError; for (j = Status->BlockIndentation; j < Status->LineInfo[0].Leading; j++) if (AddMarkup (Dataset, Status->LineInfo[0].Offset, MarkNbsp, 0)) goto DiskError; } } if (Status->InSubtitle) { if (Status->LineInfo[0].Empty) { Status->InSubtitle = 0; if (AddMarkup (Dataset, Status->LastFirstSpace /* Status->LineInfo[0].Offset */ , MarkEndSubtitle, 0)) goto DiskError; } } if (Status->InTable) { if (Status->LineInfo[0].Empty) { Status->InTable = 0; if (AddMarkup (Dataset, Status->LastFirstSpace /* Status->LineInfo[0].Offset */ , MarkEndTable, 0)) goto DiskError; } } if (Status->InParagraph) { if (Status->LineInfo[0].Empty /* || Status->LineInfo[0].BeginsWhite */ ) { // End the paragraph. For safety's sake, assume that // italicizing doesn't cross paragraphs, if it's of the // delimited kind. An exception for the delimited kind // is if was used. We assume in that case // that the user had enough on the ball to get it right. if (Status->Italicizing != 0 && Status->Italicizing != 'A' && Status->Italicizing != 'H') { Status->Italicizing = 0; if (AddMarkup (Dataset, Status->LastFirstSpace /* Status->LineInfo[0].Offset */ , MarkEndItalics, 0)) goto DiskError; } Status->InParagraph = 0; if (AddMarkup (Dataset, Status->LastFirstSpace /* Status->LineInfo[0].Offset */ , MarkEndParagraph, 0)) goto DiskError; } else if (Status->Versifying) { if (Status->LineInfo[0].VerseCap || !Status->LineInfo[0].BeginsWhite || (!Status->LineInfo[0].VerseCap && Status->LineInfo[0].Leading > 1)) { if (AddMarkup (Dataset, Status->LastFirstSpace /* Status->LineInfo[0].Offset */ , MarkBreak, 0)) goto DiskError; for (j = Status->BlockIndentation; j < Status->LineInfo[0].Leading; j++) if (AddMarkup (Dataset, Status->LineInfo[0].Offset, MarkNbsp, 0)) goto DiskError; } } else if (Status->LineInfo[-1].Leading > 1 || Status->LineInfo[0].Leading > 1 || (Status->LineInfo[-1].Leading == 1 && Status->LineInfo[0].Leading == 1) || (Status->LineInfo[0].Leading == 1 && !Status->LineInfo[-1].Empty && Status->LineInfo[-2].Leading == 1) || (Status->LineInfo[0].Leading == 1 && !Status->LineInfo[1].Empty && Status->LineInfo[2].Leading == 1) || (Status->LineInfo[-1].Leading == 1 && !Status->LineInfo[-2].Empty && Status->LineInfo[-3].Leading == 1) || (Status->LineInfo[-1].Leading == 1 && !Status->LineInfo[0].Empty && Status->LineInfo[1].Leading == 1)) { // The point of this to to detect intentionally indented lines // in the middle of a paragraph. But we assume that a single // line with a single space at the front -- isolated, without // other such lines in the vicinity -- is a simple mistake. // However, some verse has every OTHER line indented, so we // try to detect that. if (AddMarkup (Dataset, Status->LastFirstSpace /* Status->LineInfo[0].Offset */ , MarkBreak, 0)) goto DiskError; if (!Status->Centering) { for (j = Status->BlockIndentation; j < Status->LineInfo[0].Leading; j++) if (AddMarkup (Dataset, Status->LineInfo[0].Offset, MarkNbsp, 0)) goto DiskError; } } else if (LineNum < Dataset->LowestNonPrefatoryLine) { if (AddMarkup (Dataset, Status->LastFirstSpace /* Status->LineInfo[0].Offset */ , MarkBreak, 0)) goto DiskError; } else if (!Status->LineInfo[-1].Empty && Status->LineInfo[-1].Short && (Status->LineInfo[-2].Empty || Status->LineInfo[-2].Short) && (Status->LineInfo[0].Short || (Status->LineInfo[1].Empty || Status->LineInfo[1].Short)) && Status->ParagraphType != MarkBeginJustifiedParagraph) { if (AddMarkup (Dataset, Status->LastFirstSpace /* Status->LineInfo[0].Offset */ , MarkBreak, 0)) goto DiskError; } else if (Status->LineInfo[0].VerseCap) { // Hopefully, what with the "Status->Versifying" variable (which // didn't exist previously), this code shouldn't be needed. // But heck, it was here! What we do here is to try and // detect a line of verse which is nevetheless longer than // the arbitrary limit of Status->LineInfo[0].Short. (This is // actually a pretty rare condition, but heck!) // We do this by noting that lines of verse are generally // initially capitalized, so we look for a whole block of // lines that meet this criterion. k = 1; for (j = -1; j >= -PRE_OR_POST_LINES; j--) if (Status->LineInfo[j].VerseCap) k++; else if (!Status->LineInfo[j].Empty) break; if (j >= -PRE_OR_POST_LINES && !Status->LineInfo[j].Empty) k = -1000; for (j = 1; j <= PRE_OR_POST_LINES; j++) if (Status->LineInfo[j].VerseCap) k++; else if (!Status->LineInfo[j].Empty) break; if (j <= PRE_OR_POST_LINES && !Status->LineInfo[j].Empty) k = -1000; if (k >= 4) { if (AddMarkup (Dataset, Status->LastFirstSpace /* Status->LineInfo[0].Offset */ , MarkBreak, 0)) goto DiskError; } } } if (Status->InHeader1 && Status->LineInfo[-1].Empty) { Status->InHeader1 = 0; Status->LastWasHeader1 = 1; if (AddMarkup (Dataset, Status->LastFirstSpace /* Status->LineInfo[0].Offset */ , MarkEndHeader1, 0)) goto DiskError; } if (!Status->InParagraph && !Status->InBlockquote && !Status->InHeader1 && !Status->InSubtitle && !Status->InTable && !Status->LineInfo[0].Empty) { // Status->LastVersifying is a variable that grows slowly while versifying, // but decays quickly when not. It allows a sort of persistence of // detecting verse when intervening stuff is thrown in (like // footnotes, dividers "****", and so on). if (Status->Versifying) Status->LastVersifying++; else if (Status->LastWasHeader1) Status->LastVersifying--; else Status->LastVersifying /= 2; Status->Versifying = 0; // If we find stuff we know is going to be uninterpretable, go // into pre-formatted mode. if (Status->LineInfo[0].WeirdSequences || Status->LineInfo[1].WeirdSequences) { Status->InTable = 1; if (AddMarkup (Dataset, Status->LineInfo[0].Offset, MarkBeginTable, 0)) goto DiskError; goto ParagraphBegun; } if (Status->LineInfo[0].BeginsWhite) { // If the line began with whitespace, we never count it as a // potential heading, unless it matches certain string patterns. j = Status->LastWasHeader1; if (LineNum > Dataset->LowestNonPrefatoryLine) if (MatchesPrefatoryLine (Dataset, s) || MatchSpecialWords (Dataset, s, &Status->LineInfo[0])) { if (IsHeader (Status->BufferedLines, &Status->LastWasHeader1, LineNum, Dataset, s)) { if (NowMatching (Dataset, s, &Status->LineInfo[0])) goto StartHeader1; } else if (j) { j = 0; if (MatchesPrefatoryLine (Dataset, s) || IsHeader (Status->BufferedLines, &j, LineNum, Dataset, s)) { BeginSubtitle:Status->InSubtitle = 1; for (ss = s; isspace (*ss); ss++); if (AddMarkup (Dataset, Status->LineInfo[0].Offset + ss - s, MarkBeginSubtitle, 0)) goto DiskError; goto ParagraphBegun; } } } // Here we try to detect block quotes. We do this by // determining that all of the lines are indented the // same amount, and that they don't begin all capitalized. k = Status->LineInfo[0].Leading; if (Status->LineInfo[0].BeginsQuote && Status->LineInfo[1].Leading == k + 1) k++; n = Status->LineInfo[0].VerseCap; for (j = 1; j <= PRE_OR_POST_LINES; j++) { if (Status->LineInfo[j].Empty) break; if (Status->LineInfo[j].Leading != k) break; n += Status->LineInfo[j].VerseCap; } if ((j > PRE_OR_POST_LINES || Status->LineInfo[j].Empty) && j > 3 && n != j) { Status->InBlockquote = 1; Status->BlockIndentation = k; if (AddMarkup (Dataset, Status->LineInfo[0].Offset, MarkBlockquote, 0)) goto DiskError; goto ParagraphBegun; } // Well, not recognized as a block quote. Status->Type = MarkBeginRaggedParagraph; StartParagraph:if (!Status->Versifying) { // Is it the start of a table? We'd LIKE to detect // these simply as paragraphs all of whose lines are // "PossibleTable" data. However, it's possible that the // column-headers might occupy multiple rows, and if one // of the column-headers is one or more lines longer than // the others, it can't be recognized as tabular data. // So we have to count these specially. For tables // containing blank lines -- well, we're just out of // luck there. for (j = 0, k = -1; j <= PRE_OR_POST_LINES; j++) if (Status->LineInfo[j].Empty) break; else if (!Status->LineInfo[j].PossibleTable) { if (k == j - 1) k++; else break; } if (j > k + 2 && (j > PRE_OR_POST_LINES || Status->LineInfo[j].Empty)) { Status->InTable = 1; if (AddMarkup (Dataset, Status->LineInfo[0].Offset, MarkBeginTable, 0)) goto DiskError; goto ParagraphBegun; } } // No, just a regular paragraph. Status->InParagraph = 1; // Try to detect centered text. There are two conditions we // apply. First, at least one line in the current contiguous // group is ragged. Second, we count the number of ragged lines // in the vicinity, and determine that the count is greater than // a certain number. An exception to the latter is if every // line in the contiguous group is ragged. Lastly, if the // paragraph contains a single line, the RaggedStart criterion // is too limiting, so in that case we look for any white space. // All of this applies only if the sole line that's indented is // NOT the first line of the paragraph. Oh, and don't center // verse: if (Status->Versifying) { Status->Type = MarkBeginRaggedParagraph; goto TypeKnown; } // Here we detect a simple indented paragraph -- we think! if (Status->LineInfo[0].BeginsWhite && !Status->LineInfo[1].Empty) { for (j = 1; j <= PRE_OR_POST_LINES; j++) if (Status->LineInfo[j].Empty || Status->LineInfo[j].BeginsWhite) break; if (j > PRE_OR_POST_LINES || Status->LineInfo[j].Empty) { // Okay, it's just an indented paragraph. // Go to the regular // analysis. Status->LineInfo[0].BeginsWhite = 0; Status->LineInfo[0].Leading = 0; goto RegularParagraph; } } // Here we look for a line whose purpose is simply to // separate blocks of text. if (IsSeparatorLine (s, &Status->LineInfo[1])) { Status->Type = MarkBeginCenteredParagraph; goto TypeKnown; } // Here we look for a single contiguous group of lines // that is ragged. for (k = j = 0; j < PRE_OR_POST_LINES; j++) { if (Status->LineInfo[j].Empty) break; if (Status->LineInfo[j].RaggedStart || (j == 0 && Status->LineInfo[j].BeginsWhite)) k++; } if (k == j) // Here, EVERY line was ragged. { // One problem is that the set of contiguous lines // might have been really short -- maybe just one line. // Maybe it was just an indented paragraph, if it's only // one line. In that case, we check the vicinity for // additional ragged lines, unless the indentation is // really big! if (j == 1) { if (Status->LineInfo[0].Leading > 10) Status->Type = MarkBeginCenteredParagraph; else goto MoreRaggedChecking; } else Status->Type = MarkBeginCenteredParagraph; } else if (k > 0) // Here, just SOME lines were ragged. { MoreRaggedChecking: // We check additional lines from the vicinity, but // possibly outside the contiguous group, to see if // they're ragged too. for (k = 0, j = -PRE_OR_POST_LINES; j <= PRE_OR_POST_LINES; j++) { if (Status->LineInfo[j].RaggedStart || (j == 0 && Status->LineInfo[j].BeginsWhite)) k++; } if (k >= 2) Status->Type = MarkBeginCenteredParagraph; } TypeKnown: // 12/16/01 RSB. The label above implies that the type // of paragraph we want to begin is now known. But I // discovered a bug in which if you have a block quote // (i.e., all lines indented by the same amount) EXCEPT // with the first/last lines indented differently, then // it would be erroneously treated as a centered region. // We therefore have to intercept this case. if (Status->Type == MarkBeginCenteredParagraph) { // Count the number of lines in this block. for (j = 0; j <= PRE_OR_POST_LINES; j++) if (Status->LineInfo[j].Empty) break; // We need a certain number of lines to work with. if (j >= 4) { int NewIndentation; NewIndentation = Status->LineInfo[2].Leading; // Treat the first two lines and last line differently. // If they meet the criteria, then check the intervening // lines if ((Status->LineInfo[0].Leading >= NewIndentation || (Status->LineInfo[0].BeginsQuote && Status->LineInfo[0].Leading == NewIndentation - 1)) && Status->LineInfo[j - 1].Leading >= NewIndentation && (Status->LineInfo[1].Leading == NewIndentation || (Status->LineInfo[1].BeginsQuote && Status->LineInfo[1].Leading == NewIndentation - 1))) { for (k = 3; k < j - 1; k++) if (Status->LineInfo[k].Leading != NewIndentation) break; // Finally, meets all of the criteria for a // block quote! if (k == j - 1) { Status->BlockIndentation = NewIndentation; Status->Type = MarkBlockquote; Status->ParagraphType = MarkBlockquote; Status->Centering = 0; if (AddMarkup (Dataset, Status->LineInfo[0].Offset, Status->ParagraphType, 0)) goto DiskError; Status->InBlockquote = 1; for (j = Status->BlockIndentation; j < Status->LineInfo[0].Leading; j++) if (AddMarkup (Dataset, Status->LineInfo[0].Offset, MarkNbsp, 0)) goto DiskError; goto ParagraphBegun; } } } } // Okay, now we REALLY DO know what type we're using. Status->Centering = (Status->Type == MarkBeginCenteredParagraph); Status->ParagraphType = Status->Type; if (AddMarkup (Dataset, Status->LineInfo[0].Offset, Status->ParagraphType, 0)) goto DiskError; if (!Status->Centering) { //for (ss = s; isspace (*ss); ss++) for (j = Status->BlockIndentation; j < Status->LineInfo[0].Leading; j++) if (AddMarkup (Dataset, Status->LineInfo[0].Offset, MarkNbsp, 0)) goto DiskError; } Status->InParagraph = 1; ParagraphBegun:; } else { if (LineNum > 0 && LineNum == Dataset->LowestNonPrefatoryLine) Status->InHeader1 = 1; else if (LineNum < Dataset->LowestNonPrefatoryLine) Status->InHeader1 = 0; else { j = Status->LastWasHeader1; Status->InHeader1 = IsHeader (Status->BufferedLines, &Status->LastWasHeader1, LineNum, Dataset, s); if (j && !Status->InHeader1) { j = 0; if (MatchesPrefatoryLine (Dataset, s) || IsHeader (Status->BufferedLines, &j, LineNum, Dataset, s)) goto BeginSubtitle; } } if (Status->InHeader1) Status->InHeader1 = NowMatching (Dataset, s, &Status->LineInfo[0]); if (Status->InHeader1) { StartHeader1:Status->InHeader1 = 1; // NOTE THAT THE FOLLOWING MARKUP HAS THE POTENTIAL TO BE // OUT OF ORDER IF SOME TYPES OF CODE CHANGES OCCUR LATER! for (ss = s; isspace (*ss); ss++); if (AddMarkup (Dataset, Status->LineInfo[0].Offset + ss - s, MarkHeader1, 0)) goto DiskError; } else { RegularParagraph: // Here's where the analysis starts for a plain-Jane // unindented Gutenberg paragraph. // We know that we want to start a paragraph, but we don't // know if it should be justified or ragged. // Here we look for a line whose purpose is simply to // separate blocks of text. if (IsSeparatorLine (s, &Status->LineInfo[1])) { Status->Type = MarkBeginCenteredParagraph; goto TypeKnown; } // Let's check to see if the lines seem to be shorter // than expected. Note that we count some conditions // twice as much as those that are simply short. // We do it intentionally. if (LineNum < Dataset->LowestNonPrefatoryLine) Status->Type = MarkBeginRaggedParagraph; else { // Here we look for long chunks of obvious verse, // which we detect as complete paragraphs (to the // extent we can determine such a thing), // every line of which begins with caps. However, // we reject short paragraphs with lines beginning // with quotes from the count. for (j = k = 0; j < PRE_OR_POST_LINES; j++) if (Status->LineInfo[j].Empty) break; else if (Status->LineInfo[j].VerseCap) k++; if (k == j) { if (j > 5) Status->Versifying = 1; else if (j > 3 && Status->LastVersifying) Status->Versifying = 1; else if (j > 2 && Status->LastVersifying > 2) Status->Versifying = 1; } // Sort of a catch-all heuristic thing to detect verse // that wasn't so obvious as the above. if (!Status->Versifying) for (j = 0, k = (Status->LastVersifying > 0); j < PRE_OR_POST_LINES; j++) { if (Status->LineInfo[j].Empty) break; if (Status->LineInfo[j].BeginsWhite || (Status->LineInfo[j].ReallyShort && !Status->LineInfo[j + 1].Empty && Status->LineInfo[j].VerseCap)) k += 2; else if (Status->LineInfo[j].Short && !Status->LineInfo[j + 1].Empty && Status->LineInfo[j].VerseCap) k++; if (!Status->LineInfo[j].BeginsWhite && !Status->LineInfo[j].VerseCap) k--; } // The above rules are fine, but OCCASIONALLY // catch a line of dialog, because when there's // dialog there are usually a lot of 1- or 2-line // paragraphs, possibly short, all beginning with // caps. The line below just reduces that tendency // a little. if (j <= 2 && !Status->LineInfo[0].BeginsWhite && (Status->LineInfo[0].BeginsQuote || Status->LineInfo[-2].BeginsQuote || Status->LineInfo[2].BeginsQuote || (Status->LineInfo[2].Empty && Status->LineInfo[3].BeginsQuote) || Status->LineInfo[1].Empty)) k--; if (Status->Versifying || (k >= 2 && k >= (j - 1) / 2) || (j == 1 && Status->LineInfo[0].Short) || k == j || k == (j - 1)) { Status->Versifying = 1; Status->Type = MarkBeginRaggedParagraph; } else Status->Type = MarkBeginJustifiedParagraph; } goto StartParagraph; } } } return (0); DiskError: return (5); AtEnd: return (-1); }