///////////////////////////////////////////////////////////////////////////
/*
  Copyright 2001 Ronald S. Burkey

  This file is part of GutenMark.

  GutenMark is free software; you can redistribute it and/or modify
  it under the terms of the GNU General Public License as published by
  the Free Software Foundation; either version 2 of the License, or
  (at your option) any later version.

  GutenMark is distributed in the hope that it will be useful,
  but WITHOUT ANY WARRANTY; without even the implied warranty of
  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  GNU General Public License for more details.

  You should have received a copy of the GNU General Public License
  along with GutenMark; if not, write to the Free Software
  Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA

  Filename:     NetTrain.c
  Purpose:      This code is used to train and to apply a neural-net
                recognizer to lines of text from a PG etext.
  Mods:         12/06/01 RSB    Began.
*/
///////////////////////////////////////////////////////////////////////////

// Standard headers for the library calls made in this file:
// stdio (FILE, fopen, fgets, sscanf, fclose), stdlib (malloc, calloc,
// free) and string (strlen, strcpy).
// NOTE(review): main() also uses a type named `nnwork`, which none of
// these headers declares -- confirm which header is meant to supply it.
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

// An enumeration of all of the possible classifications of a
// line.  These are mutually exclusive, and are fixed by the
// requirements of the recognition problem.  The values double as
// indices into OutMarkups[], so they must stay dense in the range
// 0..NUM_OUTPUTS-1.
#define NUM_OUTPUTS 14
#define OUT_BLANK 0             // A blank line.
#define OUT_PREFATORY 1         // A line in the prefatory area.
#define OUT_HEADING 2           // Part of a section heading.
#define OUT_SUBHEADING 3        // A subtitle of a section heading.
#define OUT_PARA 4              // A line from a normal paragraph.
#define OUT_VERSE 5             // A line of verse.
#define OUT_BLOCK 6             // A line from a block-quote.
#define OUT_BLOCKSIG 7          // A signature for a block quote.
#define OUT_CENTER 8            // A centered line.
#define OUT_BIGPARASEP 9        // Extra white space between paragraphs.
#define OUT_FOOTSEP 10          // Separator between text and footnotes.
#define OUT_FOOT 11             // A footnote.
#define OUT_TABLEHEAD 12        // A header for a table.
#define OUT_TABLE 13            // Data from a table.

// The markup for these output types, added to the beginnings of
// lines of training data stored as text in a file.  By default,
// in training, if there is no line-markup present then the line
// is assumed to be OUT_BLANK (if blank) or OUT_PARA (if not).
//
// NOTE(review): all but two of these strings are empty and the
// OUT_CENTER entry is a bare line break, so as written the table cannot
// tell the line types apart.  Presumably the real markup tags were lost
// from this copy of the file -- restore them before relying on the table.
static const char *OutMarkups[NUM_OUTPUTS] = {
  "<>",                         // OUT_BLANK
  "",                           // OUT_PREFATORY
  "",                           // OUT_HEADING
  "",                           // OUT_SUBHEADING
  "",                           // OUT_PARA
  "",                           // OUT_VERSE
  "",                           // OUT_BLOCK
  "",                           // OUT_BLOCKSIG
  "\n",                         // OUT_CENTER
  "",                           // OUT_BIGPARASEP
  "",                           // OUT_FOOTSEP
  "",                           // OUT_FOOT
  "",                           // OUT_TABLEHEAD
  ""                            // OUT_TABLE
};

// An enumeration of all the metric values computed for a single line.
// These can be increased later by enriching the code.  NUM_METRICS must
// equal the number of MET_xxx values below (and of MetricNames entries):
// it sizes MetricNames[] and is the upper bound applied to the metric
// indices read from the configuration file.
#define NUM_METRICS 8
#define MET_START 0             // 1 if prior to the prefatory area.
#define MET_BLANK 1             // 1 if blank.
#define MET_LENGTHRATIO 2       // Ratio of line length to average.
#define MET_CAPRATIO 3          // Ratio of CAPS to complete line.
#define MET_BEGINCAP 4          // 1 if begins capitalized.
#define MET_BEGINSPACES 5       // Number of spaces at front of line.
#define MET_ENDPUNCT 6          // End of line is end-of-sentence.
#define MET_SPECIALWORD 7       // Line begins with BOOK, CHAPTER, etc.

// Short display names for the metrics, indexed by MET_xxx.
static const char *MetricNames[NUM_METRICS] = {
  "START",                      // MET_START
  "BLANK",                      // MET_BLANK
  "LRATIO",                     // MET_LENGTHRATIO
  "CRATIO",                     // MET_CAPRATIO
  "BEGINC",                     // MET_BEGINCAP
  "BEGINB",                     // MET_BEGINSPACES
  "ENDP",                       // MET_ENDPUNCT
  "SPECIAL"                     // MET_SPECIALWORD
};

// The input values to the net consist of various metric values for
// the line in question, along with some subset of the metrics for
// the neighboring lines, and other miscellaneous items like a
// startcode.  These are not enumerated within the code, because they
// can be changed without recoding by means of a configuration file.
// The number of hidden nodes is also selected by the configuration file.
// The format of the configuration file is ASCII, as follows:
//      Line 1          Names of the weight-storage input and output files.
//      Line 2          Number of hidden nodes.
//      One line for each text line, with space-delimited numeric fields:
//              Field 1         Index offset from current line.
//              One field for each metric value.
// For example:
//      MyFileIn.nnw MyFileOut.nnw
//      20
//      0 0 1 2 3 4 5 6 7
//      1 0 1 2 3 4
//      -1 0 1 2 3 4
//      2 0 1 2
//      -2 0 1 2
//      3 2
//      -3 2
// Here the weights are initially taken from MyFileIn.nnw, and are written
// to MyFileOut.nnw after additional training.  There are 20 hidden nodes,
// and 26 input values to the net:  all 8 metrics from the current line,
// 5 (0-4) metrics from each of the immediately adjacent lines, 3 metrics
// (0-2) from each of the lines just beyond that, and 1 metric (2) from
// each of the next lines beyond those (8 + 2*5 + 2*3 + 2*1 = 26).
typedef struct { signed char LineOffset; // Allows for lines from -128..+127. unsigned char MetricIndex; // Up to 256 metrics per line. } InputPointer; typedef struct { char *InputWeightFile, *OutputWeightFile; int NumHidden; int NumInputs; InputPointer *Pointers; } NnConfiguration; //------------------------------------------------------------------------ // Reads our neural net configuration file. Returns NULL on error, // or a pointer to a newly-allocated configuration structure otherwise. NnConfiguration * ReadNnConfig (const char *ConfigFilename) { #define MAX_INPUTS 1000; char s[1000], ss[1000], sss[1000]; int i, j, k, Indices[32]; InputPointer Pointers[MAX_INPUTS]; FILE *ConfigFile = NULL; NnConfiguration *Config = NULL; ConfigFile = fopen (ConfigFilename, "r"); if (ConfigFile == NULL) goto Error; if (NULL == fgets (s, sizeof (s), ConfigFile)) goto Error; if (2 != sscanf (s, "%s%s", ss, sss)) goto Error; Config = (NnConfiguration *) calloc (1, sizeof (NnConfiguration)); if (Config == NULL) goto Error; Config->NumInputs = 0; Config->InputWeightFile = (char *) malloc (strlen (ss) + 1); if (Config->InputWeightFile == NULL) goto Error; strcpy (Config->InputWeightFile, ss); Config->OutputWeightFile = (char *) malloc (strlen (sss) + 1); if (Config->OutputWeightFile == NULL) goto Error; strcpy (Config->OutputWeightFile, sss); if (NULL == fgets (s, sizeof (s), ConfigFile)) goto Error; if (1 != sscanf (s, "%d", &(Config->NumHidden))) goto Error; while (NULL != fgets (s, sizeof (s), ConfigFile)) { i = sscanf (s, "%d%d%d%d%d%d%d%d%d%d%d%d%d%d%d%d%d%d%d%d" "%d%d%d%d%d%d%d%d%d%d%d%d%d", &j, &Indices[0], &Indices[1], &Indices[2], &Indices[3], &Indices[4], &Indices[5], &Indices[6], &Indices[7], &Indices[8], &Indices[9], &Indices[10], &Indices[11], &Indices[12], &Indices[13], &Indices[14], &Indices[15], &Indices[16], &Indices[17], &Indices[18], &Indices[19], &Indices[20], &Indices[21], &Indices[22], &Indices[23], &Indices[24], &Indices[25], &Indices[26], &Indices[27], 
&Indices[28], &Indices[29], &Indices[30], &Indices[31], ); if (i > 1) { if (j < -128 || j > 127) goto Error;; i--; for (k = 0; k < i; k++) { if (Config->NumInputs >= MAX_INPUTS) goto Error; if (Indices[k] < 0 || Indices[k] >= NUM_METRICS) goto Error; Inputs[Config->NumInputs].LineOffset = j; Inputs[Config->NumInputs].Metric = Indices[k]; Config->NumInputs++; } } } fclose (ConfigFile); Config->Pointers = (InputPointer *) calloc (ConfigNumInputs, sizeof (Pointer)); if (Config->Pointers == NULL) goto Error; for (i = 0; i < Config->NumInputs; i++) Config->Pointers[i] = Inputs[i]; return (Config); Error: if (Config != NULL) { if (ConfigFile != NULL) fclose (ConfigFile); if (Config->InputWeightFile != NULL) free (Config->InputWeightFile); if (Config->OutputWeightFile != NULL) free (Config->OutputWeightFile); if (Config->Pointers != NULL) free (Config->Pointers); free (Config); } return (NULL); #undef MAX_INPUTS } //---------------------------------------------------------------------- // A main program that can be used to train a neural net from a // marked-up etext. int main (void) { NnConfigurationFile *Config; Config = ReadNnConfig ("GutenMark.net"); if (Config == NULL) return (1); nnwork LineRecognizer (Config->NumInputs, Config->NumHidden, NUM_OUTPUTS); float Outputs[NUM_OUTPUTS]; float *Inputs = (float *) calloc (sizeof (float), Config->NumInputs); } #ifdef 0 if (character == 't') { // train, do 50 runs. 
for (int i = 0; i < 50; i++) { result[0] = 0.9; // Train with even data brain.train (test_data_even_1, result, 0.0000000005, 0.2); brain.train (test_data_even_2, result, 0.0000000005, 0.2); brain.train (test_data_even_3, result, 0.0000000005, 0.2); brain.train (test_data_even_4, result, 0.0000000005, 0.2); brain.train (test_data_even_5, result, 0.0000000005, 0.2); brain.train (test_data_even_6, result, 0.0000000005, 0.2); result[0] = 0.1; // Train with odd data (counterexample) brain.train (test_data_odd_1, result, 0.0000000005, 0.2); brain.train (test_data_odd_2, result, 0.0000000005, 0.2); brain.train (test_data_odd_3, result, 0.0000000005, 0.2); brain.train (test_data_odd_4, result, 0.0000000005, 0.2); brain.train (test_data_odd_5, result, 0.0000000005, 0.2); brain.train (test_data_odd_6, result, 0.0000000005, 0.2); cout << i + 1 << " training iterations completed.\n"; } brain.save ("odd_even.nnw"); // save the network } else if (!brain.load ("odd_even.nnw")) { // try to load from the file cerr << "File not found.\n"; return -1; } // Now run the network with one odd and one even test value. cout << "Odd test (should be 0.1):\n"; brain.run (test_data_odd_1, result); cout << result[0] << endl; cout << "Even test (should be 0.9):\n"; brain.run (test_data_even_3, result); cout << result[0] << endl; return 0; } #endif