///////////////////////////////////////////////////////////////////////////// /* Copyright 2001 Ronald S. Burkey This file is part of GutenMark. GutenMark is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation; either version 2 of the License, or (at your option) any later version. GutenMark is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with GutenMark; if not, write to the Free Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA Filename: Markup.c Purpose: This is the analysis function used by AutoMark.c Mods: 08/30/01 RSB Began. 11/02/01 RSB Added GPL disclaimer and reformatted somewhat for first web release. This file analyzes the input file, and writes MarkupRecord records to the markup file. The markup records are written in the same order they will eventually appear within the HTML. I was going to write this to accomodate either DOS or Unix end-of-line conventions, but at least at first, I've decided to just assume the text file is in the native format, so that lines are terminated with \n. */ #include #include "AutoMark.h" //--------------------------------------------------------------------- int Markup (AnalysisDataset * Dataset) { unsigned SizeSum; int NumSamples; int i, j, MaxInGroup; char s[256]; // First, let's analyze the input file to see if it has a header. // The various analysis functions for different header types return // non-zero when that particular header type has been found. When it // returns zero, the header hasn't been found, so checks for other // header types can be carried out. if (0 == CheckGutenbergHeader (Dataset)) { } // Let's determine the aproximate margins. What we do is to take the // longest line out of each group of fify lines, and then average them. // We don't include the Gutenberg header, nor any partial group at the // end of the file. fseek (Dataset->InputFile, Dataset->TextStart, SEEK_SET); s[sizeof (s) - 1] = 0; for (SizeSum = NumSamples = 0; NumSamples < 100; SizeSum += MaxInGroup, NumSamples++) { for (MaxInGroup = i = 0; i < 50; i++) { if (NULL == fgets (s, sizeof (s) - 1, Dataset->InputFile)) break; j = strlen (s); if (j > MaxInGroup) MaxInGroup = j; } if (i < 50) break; } if (NumSamples == 0) { Dataset->ShortLineSize = 50; Dataset->ReallyShortLineSize = 35; } else { SizeSum /= NumSamples; //Dataset->ShortLineSize = (5 * SizeSum) / 7; //Dataset->ReallyShortLineSize = SizeSum / 2; Dataset->ShortLineSize = (10 * SizeSum) / 13; Dataset->ReallyShortLineSize = (5 * SizeSum) / 9; } // Let's mark paragraphs. if (MarkBody (Dataset)) { fprintf (stderr, "Error marking-up text.\n"); return (1); } return (0); }