///////////////////////////////////////////////////////////////////////////// /* Copyright 2001 Ronald S. Burkey This file is part of GutenMark. GutenMark is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation; either version 2 of the License, or (at your option) any later version. GutenMark is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with GutenMark; if not, write to the Free Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA Filename: LineAnalysisPass.c Purpose: This pass collects useful info about the lines in the input file. This data is later combined heuristically to (hopefully) answer such questions as whether the line is a header, where paragraphs begin and end, and so forth. Mods: 09/01/01 RSB Began. 09/03/01 RSB Altered definition of all-caps to include McSOMETHING, MacSOMETHING, DeSOMETHING. 09/15/01 RSB ... and unfortunately introduced a bug in which the NEXT character was always being checked by IsLower! Fixed this, and allowed IsLower to take care of the lower-case characters in "8-bit" ASCII. Now allow things like "(A" to count as VerseCap. 11/02/01 RSB Added GPL disclaimer and reformatted somewhat for first web release. 12/13/01 RSB Added WeirdSequences. The lines which are analyzed are not part of any header located previously. In other words, we begin with "line 0" at Dataset->TextStart. */ #include #include #include #include "AutoMark.h" #ifdef __BORLANDC__ #define strncasecmp strncmpi #endif // Strings that may be at the fronts of names that don't affect the // capitalization status of the name. 
const char *NamePrefixes[] = {
  "Mac", "Mc", "De", "Di", "Da", "La", "Le", "Van", "Von"
};

#define NUM_NAME_PREFIXES (sizeof (NamePrefixes) / sizeof (NamePrefixes[0]))

// Lengths of the corresponding NamePrefixes[] entries.  These are filled
// in by LineAnalysisPass() before any line is examined.
int NamePrefixSizes[NUM_NAME_PREFIXES];

//----------------------------------------------------------------------
// This is a replacement for fgets, in which a form feed can terminate a
// line.  A form feed is stored in the buffer as '\n'.
//
//   s    - destination buffer.
//   size - maximum number of characters to store.  NOTE: unlike fgets,
//          up to 'size' characters are stored and the terminating NUL is
//          only written when fewer than 'size' characters were read, so
//          the caller must provide (and pre-terminate) one extra byte.
//   fp   - stream to read from.
//
// Returns s if at least one character was read, or NULL if end-of-file
// (or a read error) occurred before any character was read, or if size
// is not positive.  As with fgets, a final line that is not terminated
// by a newline is returned rather than discarded.
char *
rfgets (char *s, int size, FILE * fp)
{
  char *RetVal = s;
  int ch = 0;

  // A non-positive size leaves no room to store anything; a negative
  // size would otherwise never count down to zero safely.
  if (size <= 0)
    return (NULL);

  while (size)
    {
      ch = getc (fp);
      if (ch == EOF)
	break;
      if (ch == '\f')
	ch = '\n';
      *s++ = (char) ch;
      size--;
      if (ch == '\n')
	break;
    }

  // Terminate whenever there is still room.  (On EOF there always is,
  // because the loop exits before 'size' is decremented.)
  if (size)
    *s = 0;

  // Only report end-of-file when nothing at all was read; otherwise the
  // unterminated last line of the file would be lost.
  if (ch == EOF && s == RetVal)
    return (NULL);
  return (RetVal);
}

//------------------------------------------------------------------------
// Checks if a character is one of the roman-numeral characters.
// We don't check for 'D' or 'M' because our entire focus is on detecting
// chapter numbers or book numbers.  While there are books with over 100
// chapters, I don't think there are any or many with over 500.
// Returns 1 for (case-insensitive) I, V, X, L, or C, and 0 otherwise.
static int
isroman (char c)
{
  // The <ctype.h> functions require a value representable as unsigned
  // char (or EOF); a plain char holding an 8-bit character may be negative.
  int uc = toupper ((unsigned char) c);

  return (uc == 'I' || uc == 'V' || uc == 'X' || uc == 'L' || uc == 'C');
}

//-----------------------------------------------------------------------
// Checks for characters unlikely to appear at the end of a header.
// Admittedly, some of these characters can appear, but ...
// Returns 1 for any punctuation character other than ')', else 0.
static int
IsEndPeriod (char c)
{
  // Cast for the same reason as in isroman(): avoid passing a negative
  // value to ispunct().
  return (ispunct ((unsigned char) c) && c != ')'
	  /* && c != '\"' && c != '\'' */ );
}

//----------------------------------------------------------------------
// Checks for lower-case letter at position sss in string s.  Exceptions
// are made so that "McSOMETHING", "MacSOMETHING", "DeSOMETHING" and
// maybe others aren't counted as lower case.  Note that as given, the
// code only works for prefixes that actually begin with an upper-case
// letter followed by lower-case letter(s).
#define IsPureUpper(c) ( \ ( ((unsigned char) (c)) >= 'A' && ((unsigned char) (c)) <= 'Z' ) \ || ( ((unsigned char) (c)) >= 192 && ((unsigned char) (c)) <= 222 ) \ ) static int IsLower (char *s, char **sss) { int i; char c; unsigned uch; uch = (unsigned char) **sss; if ((uch >= 'A' && uch <= 'Z') && ((*sss)[1] >= 'a' && (*sss)[1] <= 'z')) if (*sss == s || isspace ((*sss)[-1]) || (*sss)[-1] == '-') { for (i = 0; i < NUM_NAME_PREFIXES; i++) if (0 == strncmp (*sss, NamePrefixes[i], NamePrefixSizes[i])) { c = (*sss)[NamePrefixSizes[i]]; if (IsPureUpper (c)) { (*sss) += NamePrefixSizes[i] + 1; return (0); } } } (*sss)++; return ((uch >= 'a' && uch <= 'z') || (uch >= 223 && uch <= 255)); } //--------------------------------------------------------------------- // Detects that the input line begins with certain special words. static const char *SpecialWords[] = { "Book", "Chapter", "Chap", "Part", "footnotes", "footnotes:", "endnotes", "endnotes:", "index", "introduction", "preface", "foreward", "summary", "conclusion", "appendix", "prologue", "prolog", "epilogue", "epilog", "glossary", "about the author", "advertisement", "advertisement.", "synopsis", "executive summary", "recommendation", "recommendations", "letter", "story" }; #define NUM_SPECIALWORDS (sizeof (SpecialWords) / sizeof (SpecialWords[0])) static int SpecialWordSizes[NUM_SPECIALWORDS]; static int SpecialWordsInitialized = 0; int MatchSpecialWords (AnalysisDataset * Dataset, char *ss, LineRecord * Line) { int i; if (!SpecialWordsInitialized) { for (i = 0; i < NUM_SPECIALWORDS; i++) SpecialWordSizes[i] = strlen (SpecialWords[i]); SpecialWordsInitialized = 1; } for (; isspace (*ss); ss++); if (strncasecmp (ss, "contents", 8) == 0 || strncasecmp (ss, "table of contents", 17) == 0) Line->Contents = 1; for (i = 0; i < NUM_SPECIALWORDS; i++) if (0 == strncasecmp (ss, SpecialWords[i], SpecialWordSizes[i]) && isspace (ss[SpecialWordSizes[i]])) { Line->BeginsChapter = 1; return (1); } return (0); } 
//----------------------------------------------------------------------- // Returns 0 on success. int LineAnalysisPass (AnalysisDataset * Dataset) { unsigned long Offset; char s[AUTOMARK_MAX_LINESIZE], *ss, *sss, *siv; LineRecord Line, DefaultLine = { 0 }; int ReturnValue = 1; int i, Leading; // Set up the name-prefix array. for (i = 0; i < NUM_NAME_PREFIXES; i++) NamePrefixSizes[i] = strlen (NamePrefixes[i]); Offset = Dataset->TextStart; s[sizeof (s) - 1] = '\0'; fseek (Dataset->InputFile, Dataset->TextStart, SEEK_SET); while (NULL != rfgets (s, sizeof (s) - 1, Dataset->InputFile)) { Line = DefaultLine; // First record the offset of the line. Line.Offset = Offset; Offset = ftell (Dataset->InputFile); // Now, miscellaneous other info. if (*s == '\0') Line.Empty = 1; else { // Check for possible tabular data. Recognized by multiple // whitespace between non-white characters. This is defined // as at least 3 whites, or at least 4 whites following // punctuation. for (ss = s; isspace (*ss); ss++); // ss -> first non-space. for (sss = ss + 1; *sss; sss++); // find trim end of line too. for (sss--; sss > s && isspace (*sss); sss--); // nonspace for ( /*ss */ ; ss <= sss; ss++) if (isspace (*ss) && isspace (ss[1]) && isspace (ss[2]) && (!ispunct (ss[-1]) || isspace (ss[3]))) { Line.PossibleTable = 1; break; } // Check out the condition of white space at the beginning of // the line. if (isspace (*s)) Line.BeginsWhite = 1; Leading = 0; for (ss = s; isspace (*ss); ss++) if (*ss == ' ') Leading++; else if (*ss == '\t') Leading = (Leading + 8) & ~7; else if (*ss == '\f') /* Line.FormFeedPrior = 1 */ ; Line.WeirdSequences = (*ss == '|'); Line.BeginsQuote = (*ss == '\'' || *ss == '\"'); if (Leading < 64) Line.Leading = Leading; else Line.Leading = 63; if (*ss == '\0') Line.Empty = 1; else { Line.RaggedStart = (Leading > 8 && 0 != (Leading & 7) && 0 != (Leading % 5)); // Check the various capitalization things. 
sss = ss; if (*sss == '-' && sss[1] == '-') { sss += 2; while (isspace (*sss)) sss++; } // Note that could have "( or (". if (*sss == '\"' || *sss == '\'' || *sss == '(' || *sss == '{') sss++; if (*sss == '\"' || *sss == '\'' || *sss == '(' || *sss == '{') sss++; if (IsPureUpper (*sss)) Line.VerseCap = 1; if (IsPureUpper (*ss)) { Line.CapFirstChar = 1; Line.VerseCap = 1; Line.CapFirstWord = 1; for (sss = ss; isalpha (*sss);) if (IsLower (s, &sss)) { Line.CapFirstWord = 0; break; } if (Line.CapFirstWord) { Line.AllCaps = 1; for (; *sss;) if (IsLower (s, &sss)) { Line.AllCaps = 0; break; } } } // Check if the line specifically indicates a chapter heading. if (MatchSpecialWords (Dataset, ss, &Line)) Line.BeginsBook = 1; if (Line.Offset == Dataset->TextStart) Line.Scanned = 1; // Check if the line begins with a Roman numeral. Line.BeginsRoman = isroman (*ss); for (sss = ss + 1; *sss && !isspace (*sss) && !ispunct (*sss); sss++) if (!isroman (*sss)) { Line.BeginsRoman = 0; break; } if (Line.BeginsRoman) { siv = sss; if (*siv == '.') siv++; while (isspace (*siv)) siv++; if (*siv == '\0') Line.OnlyRoman = 1; } Line.WhiteAfterRoman = (Line.BeginsRoman && isspace (*sss)); Line.SpecialRoman = (Line.BeginsRoman && sss == ss + 1 && (*ss == 'I' || *ss == 'X' || *ss == 'C')); if (Line.SpecialRoman) { // If "I" (or "I." is the ONLY thing on the line, // then it's just as good as any other Roman number. if (*sss == '.') sss++; for (; isspace (*sss); sss++); if (*sss == '\0') Line.SpecialRoman = 0; } // See if there's a period at the end of the line. 
for (sss = ss; *sss && *sss != '\n'; sss++); for (; sss > s && isspace (*sss); sss--) if (*sss == '\f') /* Line.FormFeedAfter = 1 */ ; Line.WeirdSequences = Line.WeirdSequences || (*sss == '|'); if (sss > s && IsEndPeriod (sss[0])) Line.EndPeriod = 1; if (sss > s + 1 && IsEndPeriod (sss[-1]) && (sss[0] == '\"' || sss[0] == '\'')) Line.EndPeriod = 1; if (sss - ss <= Dataset->ShortLineSize) Line.Short = 1; if (sss - ss <= Dataset->ReallyShortLineSize) Line.ReallyShort = 1; } } // Look for other weird stuff. Line.WeirdSequences = Line.WeirdSequences || NULL != strstr (s, ".....") || NULL != strstr (s, ". . . . .") || NULL != strstr (s, "-----") || NULL != strstr (s, "___"); // Write the line data. if (1 != fwrite (&Line, sizeof (Line), 1, Dataset->LineFile)) { fprintf (stderr, "Disk-write error.\n"); ReturnValue = 3; goto Done; } } ReturnValue = 0; Done: Dataset->NumLines = ftell (Dataset->LineFile) / sizeof (LineRecord); return (ReturnValue); }