/////////////////////////////////////////////////////////////////////////////
/*
  Copyright 2001 Ronald S. Burkey

  This file is part of GutenMark.

  GutenMark is free software; you can redistribute it and/or modify
  it under the terms of the GNU General Public License as published by
  the Free Software Foundation; either version 2 of the License, or
  (at your option) any later version.

  GutenMark is distributed in the hope that it will be useful,
  but WITHOUT ANY WARRANTY; without even the implied warranty of
  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  GNU General Public License for more details.

  You should have received a copy of the GNU General Public License
  along with GutenMark; if not, write to the Free Software
  Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA

  Filename:	CheckGutenbergHeader.c
  Purpose:	This analyzes the input text file to see if it has
  		a Project Gutenberg header.
  Mods:		08/31/01 RSB	Began.
  		11/02/01 RSB	Added GPL disclaimer and reformatted
				somewhat for first web release.
		12/27/01 RSB	Now add MarkEndOfGutenbergHeader when the
				header is not found (allowing non-PG
				files to be marked up).

  There's no certain way to detect the Project Gutenberg header.
  This function applies the following heuristic:

  1.  The phrase "Project Gutenberg" and the word "etext" must appear
      in some line within the first 50 lines of the file; and
  2.  The string "*end*" must appear within the first 500 lines of
      the file.

  If so, the header is assumed to begin with the first non-blank line
  of the file.  The first actual line of text is assumed to be the
  first non-blank line after the line containing "*end*".
*/

#include <stdio.h>
#include <string.h>
#include <ctype.h>
#include "AutoMark.h"

#define START_MAX 50
#define END_MAX 500

//------------------------------------------------------------------------
// Convert a string to upper case.
static void StrUpr (char *s) { for (; *s != '\0'; s++) *s = toupper (*s); } //------------------------------------------------------------------------ int CheckGutenbergHeader (AnalysisDataset * Dataset) { int Count; char s[256], ss[256]; unsigned long Offset = 0; MarkupRecord Mark; Mark.Offset = 0; s[sizeof (s) - 1] = 0; fseek (Dataset->InputFile, Dataset->TextStart, SEEK_SET); // Search for the start of the header. for (Count = 0; Count < START_MAX; Count++) { if (NULL == fgets (s, sizeof (s) - 1, Dataset->InputFile)) goto Bypass; StrUpr (s); if (NULL != strstr (s, "PROJECT GUTENBERG") && NULL != strstr (s, "ETEXT")) break; } if (Count >= START_MAX) goto Bypass; // Search for the end of the header. for (; Count < END_MAX; Count++) { if (NULL == fgets (s, sizeof (s) - 1, Dataset->InputFile)) goto Bypass; StrUpr (s); if (NULL != strstr (s, "*END*")) break; } if (Count >= END_MAX) goto Bypass; // Okay, find the first non-blank line. for (;;) { Offset = ftell (Dataset->InputFile); if (NULL == fgets (s, sizeof (s) - 1, Dataset->InputFile)) goto Bypass; // See if the line has any content. if (1 == sscanf (s, "%s", ss)) break; } // Well, it has been found. Let's add the markups for the beginning and // the end of the header. Mark.Offset = 0; Mark.Type = MarkJumpPastGutenbergHeader; fwrite (&Mark, sizeof (Mark), 1, Dataset->MarkupFile); Mark.Offset = Offset; Bypass: // Come here if no header found. Mark.Type = MarkEndOfGutenbergHeader; fwrite (&Mark, sizeof (Mark), 1, Dataset->MarkupFile); Dataset->TextStart = Offset; return (Mark.Offset != 0); }