#!/usr/local/bin/perl -w

=head1 NAME

count.pl

=head1 SYNOPSIS

count.pl takes as input one or more text files and calculates the ngram
frequency for the whole corpus.

=head1 DESCRIPTION

See perldoc README.pod

=head1 AUTHOR

Satanjeev Banerjee, bane0025@d.umn.edu

Ted Pedersen, tpederse@d.umn.edu

=head1 BUGS

=head1 SEE ALSO

home page:    http://www.d.umn.edu/~tpederse/nsp.html

mailing list: http://groups.yahoo.com/group/ngram/

=head1 COPYRIGHT

Copyright (C) 2000-2003, Ted Pedersen and Satanjeev Banerjee

This program is free software; you can redistribute it and/or modify it
under the terms of the GNU General Public License as published by the Free
Software Foundation; either version 2 of the License, or (at your option)
any later version.

This program is distributed in the hope that it will be useful, but WITHOUT
ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
more details.

You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.

=cut

# count.pl version 0.57
#
###############################################################################
#
#                 -------         CHANGELOG       ---------
#
#version   date         programmer   List of changes           change-id
#
# 0.53     01/06/2003   Amruta       (1) Added Perl Regex      ADP.53.1
#                                        support for stop
#                                        option
#
#          01/06/2003   Amruta       (2) Added AND & OR modes
#                                        for stop option       ADP.53.2
#                                        making AND default
#
#          01/07/2003   Amruta       (3) Introduced
#                                        --nontoken option     ADP.53.3
#
# 0.57     06/30/2003   Ted          (1) show remove value     TDP.57.1
#                                        in extended output
#
#          07/01/2003   Ted          (2) if destination file   TDP.57.2
#                                        found, check for
#                                        source before proceeding
###############################################################################

#-----------------------------------------------------------------------------
#                              Start of program
#-----------------------------------------------------------------------------

# we have to use commandline options, so use the necessary package!
use Getopt::Long;

# first check if no commandline options have been provided... in which case
# print out the usage notes!
if ( @ARGV == 0 )
{
    minimalUsageNotes();
    exit;
}

# now get the options! Getopt::Long stores each value in the package
# variable $opt_<name>, which is how the rest of the script reads them.
GetOptions( "verbose", "recurse", "version", "help", "histogram=s",
            "frequency=i", "window=i", "stop=s", "newLine", "extended",
            "token=s", "ngram=i", "remove=i", "set_freq_combo=s",
            "get_freq_combo=s", "nontoken=s" );

# if help has been requested, print out help!
if ( defined $opt_help )
{
    # NOTE(review): the assignment looks like it only exists to mention the
    # variable a second time (avoids -w's "used only once" warning) - confirm
    $opt_help = 1;
    showHelp();
    exit;
}

# if version has been requested, show version!
if ( defined $opt_version )
{
    $opt_version = 1;
    showVersion();
    exit;
}

if ( defined $opt_recurse ) { $opt_recurse = 1; }

# --frequency N: n-grams occurring fewer than N times are not displayed
$cutOff = defined $opt_frequency ? $opt_frequency : 0;

## TDP.57.1 START
# --remove N: value is echoed in the extended output header
$removeOff = defined $opt_remove ? $opt_remove : 0;
## TDP.57.1 FINISH

# --ngram N: size of the n-grams, bigrams by default
$ngram = defined $opt_ngram ? $opt_ngram : 2;

if ( $ngram <= 0 )
{
    print STDERR "Cannot have 'n' value of ngrams as less than 1\n";
    askHelp();
    exit();
}

# --window N: defaults to the n-gram size
if ( defined $opt_window )
{
    $windowSize = $opt_window;
}
else
{
    $windowSize = $ngram;
    if ( defined $opt_verbose )
    {
        print "Using default window size = $windowSize\n";
    }
}

if ( $windowSize < $ngram || ( $ngram == 1 && $windowSize != 1 ) )
{
    print STDERR "Illegal value for window size. Should be >= size of ngram (1 if size of ngram is 1).\n";
    askHelp();
    exit();
}

# get hold of the frequency combinations that we need to keep track
# of, either from the file provided
if ( defined $opt_set_freq_combo )
{
    readFreqCombo($opt_set_freq_combo);
}
# or, by default, everything possible
else
{
    getDefaultFreqCombos();
}

# at the end of those two functions we should have with us the @freqComb
# array! each row is: [ number-of-indices, index, index, ... ]

# --get_freq_combo FILE: dump the combinations in use, one row per line
if ( defined $opt_get_freq_combo )
{
    # three-argument open: the file name is never parsed for a mode prefix
    open( my $comboOut, '>', $opt_get_freq_combo )
        || die("Couldnt open $opt_get_freq_combo");

    foreach my $row ( 0 .. $combIndex - 1 )
    {
        foreach my $col ( 1 .. $freqComb[$row][0] )
        {
            print $comboOut "$freqComb[$row][$col] ";
        }
        print $comboOut "\n";
    }

    # buffered write errors only surface at close time
    close($comboOut) || die("Couldnt write $opt_get_freq_combo");
}

# check if tokens file has been supplied. if so, try to open it and extract
# the regex's.
if ( defined $opt_token )
{
    open( my $tokenFh, '<', $opt_token ) || die "Couldnt open $opt_token\n";

    # one /regex/ per line; surrounding white space and blank lines ignored
    while ( <$tokenFh> )
    {
        chomp;
        s/^\s*//;
        s/\s*$//;
        next if ( length($_) <= 0 );

        if ( !(/^\//) || !(/\/$/) )
        {
            print STDERR "Ignoring regex with no delimiters: $_\n";
            next;
        }

        # strip the delimiting slashes and remember the bare pattern
        s/^\///;
        s/\/$//;
        push @tokenRegex, $_;
    }
    close $tokenFh;
}
else
{
    # default token definitions: a run of word characters, or a single
    # punctuation mark
    push @tokenRegex, "\\w+";
    push @tokenRegex, "[\.,;:\?!]";
}

# if you dont have any tokens to work with, abort
if ( $#tokenRegex < 0 )
{
    print STDERR "No token definitions to work with.\n";
    askHelp();
    exit;
}

# create the complete token regex: every definition becomes one
# parenthesised alternative, in the order the definitions were read
$tokenizerRegex = join( "|", map { "(" . $_ . ")" } @tokenRegex );

# ---------------
# ADP.53.3 start
# ---------------
# Introducing new --nontoken option to remove any sequence of characters
# that matches the regular expression specified by the --nontoken option.
# With this, we also allow user to specify what is not a valid token.
# Providing this option is important because the user can specify some
# special character sequences which include tokens but need to be entirely
# removed

# if the non-token file is specified
if ( defined $opt_nontoken )
{
    # check if the file exists
    if ( !( -e $opt_nontoken ) )
    {
        print STDERR "Nontoken file $opt_nontoken doesn't exist.\n";
        exit;
    }

    # open the non token file
    open( my $nontokenFh, '<', $opt_nontoken )
        || die "Couldn't open Nontoken file $opt_nontoken.\n";

    my @alternatives = ();
    while ( <$nontokenFh> )
    {
        chomp;
        s/^\s+//;
        s/\s+$//;

        # handling blank lines
        next if ( /^\s*$/ );

        if ( !(/^\//) )
        {
            print STDERR "Nontoken regular expression $_ should start with '/'\n";
            exit;
        }
        if ( !(/\/$/) )
        {
            print STDERR "Nontoken regular expression $_ should end with '/'\n";
            exit;
        }

        # removing the / s from the beginning and the end
        s/^\///;
        s/\/$//;

        push @alternatives, "(" . $_ . ")";
    }
    close $nontokenFh;

    # if no valid regexs are found in Nontoken file
    if ( !@alternatives )
    {
        print STDERR "No valid Perl Regular Experssion found in Nontoken file $opt_nontoken.\n";
        exit;
    }

    # form a single regex; it stays undefined when --nontoken is not used,
    # which is what the counting loop tests for
    $nontoken_regex = join( "|", @alternatives );
}
# Added --nontoken option functionality
# -------------
# ADP.53.3 end
# -------------

# having stripped the commandline of all the options etc, we should now be
# left only with the source/destination files

# so, first get hold of the destination file!
$destination = shift;

# check to see if a destination has been supplied at all... (tested with
# defined/length so that a file literally named "0" is accepted)
if ( !defined $destination || length($destination) == 0 )
{
    print STDERR "No output file (DESTINATION) supplied.\n";
    askHelp();
    exit;
}

# TDP.57.2 start (moved this) check for destination file and source file
# before proceeding

# whats left in the command line are paths. go thru them and salvage all
# text files to be processed. the following function does just that, putting
# all useful files in @sourceFiles :o)
getSourceFiles(@ARGV);

# if not even one file found, complain and quit!
if ( $#sourceFiles == -1 )
{
    print STDERR "No input (SOURCE) file supplied!\n";
    askHelp();
    exit;
}
# TDP.57.2 finish (moved this)

# check to see if destination exists, and if so, if we should overwrite...
if ( -e $destination )
{
    print "Output file $destination already exists! Overwrite (Y/N)? ";
    $reply = <STDIN>;
    $reply = "" unless defined $reply;    # EOF on STDIN counts as "no"
    chomp $reply;
    $reply = uc $reply;
    exit 0 if ( $reply ne "Y" );
}

# having ascertained that we may open the destination file for output, lets
# do so... (DST stays a bareword handle: the output code further down the
# file writes to it by that name)
open( DST, '>', $destination ) || die "Couldn't open output file $destination";

# TDP.57.2 old location of check for source files

# output the files found, if verbose set!
if ( defined $opt_verbose )
{
    print "\nFound the following $index file(s) to source from: \n";
    foreach my $fileNo ( 0 .. $index - 1 )
    {
        print "$sourceFiles[$fileNo]\n";
    }
    print "\n";
}

# get all the permutations for this ngram/windowSize combination. this tells
# us which words to pick from a window to form the various ngrams
@permutations = ();
getPermutations( $windowSize - 1, $ngram - 1, 0 );

# ngramTotal will contain the total number of ngrams found!
$ngramTotal = 0;

# now get the source files one by one from @sourceFiles, and process them in
# a loop!
foreach $source (@sourceFiles)
{
    # &getSourceFiles already tested the file, but it can still vanish or
    # become unreadable in between, hence the die
    open( SRC, '<', $source ) || die "Cant open SOURCE file $source, quitting";

    if ( defined $opt_verbose ) { print "Accessing file $source.\n"; }

    # start off the window index which will tell us where in the window array
    # we are right now. this is a global variable to be used by processToken
    # to figure out what to do with a new token.
    $windex = 0;    # the NEXT place in the window array to write to!

    # read in the file, tokenize and process the token thus found
    while ( <SRC> )
    {
        # if we dont want ngrams to span across the new line, then every
        # time we process a new line, we need to refresh the window
        if ( defined $opt_newLine )
        {
            $opt_newLine = 1;
            $windex = 0;
        }

        # ---------------
        # ADP.53.3 start
        # ---------------
        # Removing sequences of characters which are declared as non-tokens
        # These are detected and removed before checking for tokens because
        # those sequences which include valid tokens in them should be
        # removed since the whole sequence is declared as a non-token
        if ( defined $nontoken_regex )
        {
            s/$nontoken_regex//g;
        }
        # Removed non-tokens from the input stream
        # -------------
        # ADP.53.3 end
        # -------------

        # now for this line, tokenize the line and send the token for
        # processing.
        while ( /$tokenizerRegex/g )
        {
            $token = $&;
            processToken($token);
        }
    }
}

# that is the tokenizing and token-processing done!

# now to put in the stop list, if its been provided
if ( defined $opt_stop )
{
    # open the stop list and build one big alternation out of it
    open( STP, '<', $opt_stop )
        || die("Couldn't open the stoplist file $opt_stop\n");

    # --------------
    # ADP.53.1 start
    # --------------
    # Perl Regex support for stop option
    # this will accept the stop tokens from the stop file as Perl regular
    # experssions delimited by slashes /regex/
    # each regex should appear on a separate line
    # (version 0.51 kept literal stop words in a hash instead)
    my @stopAlternatives = ();

    while ( <STP> )
    {
        chomp;

        # ---------------
        # ADP.53.2 start
        # ---------------
        # Adding support for AND and OR Stop modes
        # AND Mode will remove those ngrams which consist of all stop words
        # OR Mode will remove those ngrams which consist of at least one
        # stop word
        # Default Mode will be AND Mode
        if ( /\@stop.mode\s*=\s*(\w+)\s*$/ )
        {
            $stop_mode = $1;
            if ( !( $stop_mode =~ /^(AND|and|OR|or)$/ ) )
            {
                print STDERR "Requested Stop Mode $stop_mode is not supported.\n";
                exit;
            }
            next;
        }
        # --------------
        # ADP.53.2 end
        # --------------

        # accepting Perl Regexs from Stopfile
        s/^\s+//;
        s/\s+$//;

        # handling blank lines
        next if ( /^\s*$/ );

        # check if a valid Perl Regex
        if ( !(/^\//) )
        {
            print STDERR "Stop token regular expression <$_> should start with '/'\n";
            exit;
        }
        if ( !(/\/$/) )
        {
            print STDERR "Stop token regular expression <$_> should end with '/'\n";
            exit;
        }

        # remove the / s from beginning and end
        s/^\///;
        s/\/$//;

        push @stopAlternatives, "(" . $_ . ")";
    }
    close STP;

    if ( !@stopAlternatives )
    {
        print STDERR "No valid Perl Regular Experssion found in Stop file $opt_stop";
        exit;
    }

    # form a single big regex
    $stop_regex = join( "|", @stopAlternatives );
    # Added Perl Regex Support for Stop option
    # ------------
    # ADP.53.1 end
    # ------------

    # --------------
    # ADP.53.2 start
    # --------------
    # making AND a default stop mode
    if ( !defined $stop_mode ) { $stop_mode = "AND"; }
    my $orMode = ( $stop_mode =~ /OR|or/ ) ? 1 : 0;

    # having got the file, go thru the ngrams, removing the offending ngrams.
    # keys() returns a snapshot, so removeNgram may delete while we iterate.
    foreach my $ngramString ( keys %ngramFreq )
    {
        my @words = split /<>/, $ngramString;

        # OR mode : start at "keep", flip to "remove" on the first word that
        #           matches a stop regex
        # AND mode: start at "remove", flip to "keep" on the first word that
        #           does not match any stop regex
        my $doStop = $orMode ? 0 : 1;

        foreach my $word (@words)
        {
            my $isStopWord = ( $word =~ /$stop_regex/ ) ? 1 : 0;
            if ( $orMode && $isStopWord )
            {
                $doStop = 1;
                last;
            }
            if ( !$orMode && !$isStopWord )
            {
                $doStop = 0;
                last;
            }
        }
        # Added support for AND and OR Stop modes
        # -------------
        # ADP.53.2 end
        # -------------

        # remove this ngram and adjust all frequencies appropriately
        removeNgram($ngramString) if ($doStop);
    }
}

# now to remove n-grams if the --remove option has been taken.
if ( defined $opt_remove )
{
    foreach my $ngramString ( keys %ngramFreq )
    {
        removeNgram($ngramString) if ( $ngramFreq{$ngramString} < $opt_remove );
    }
}

# end of processing all the files. now to write out the information.
if ( defined $opt_verbose ) { print "Writing to $destination.\n"; }

# --extended: prefix the output with "@count.*" header lines
if ( defined $opt_extended )
{
    $opt_extended *= 1;

    # print out the ngram size
    print DST "\@count.Ngram=$ngram\n";

    # print out the window size used
    print DST "\@count.WindowSize=$windowSize\n";

    # print out the frequency cut off used
    print DST "\@count.FrequencyCut=$cutOff\n";

    ## TDP.57.1 START
    # print out the remove cut off used
    print DST "\@count.RemoveCut=$removeOff\n";
    ## TDP.57.1 FINISH

    ##########################################################################
    #                                                                        #
    # The following idea suggested by Otso Virtanen, ojtvirta@cs.helsinki.fi #
    #                                                                        #
    ##########################################################################

    # print out the path/file name of every input file, blank separated
    print DST "\@count.InputFilePath=";
    foreach my $path (@sourceFiles)
    {
        print DST "$path ";
    }
    print DST "\n";
}

# finally print out the total ngrams
print DST "$ngramTotal\n";

# n-grams go out in descending frequency; equal frequencies are ordered by
# the n-gram string so that the output does not depend on hash order
foreach my $ngramString (
    sort { $ngramFreq{$b} <=> $ngramFreq{$a} || $a cmp $b } keys %ngramFreq )
{
    # check if this is below the cut-off frequency to be displayed
    # as set by switch --frequency. if so, quit the loop (the list is
    # sorted, so everything after it is below the cut-off too)
    last if ( $ngramFreq{$ngramString} < $cutOff );

    # get the components of this ngram
    my @words = split /<>/, $ngramString;

    # if a line starts with a single @, its a command (extended output).
    # if it starts with two consequtive @'s, then its a single 'literal' @.
    if ( $ngramString =~ /^@/ ) { print DST '@'; }

    print DST "$ngramString";    # ngram

    # now print the frequency combo's requested. the key of a combination
    # is its words, each followed by "<>", then the combination number.
    foreach my $combo ( 0 .. $combIndex - 1 )
    {
        my $key = "";
        foreach my $position ( 1 .. $freqComb[$combo][0] )
        {
            $key .= "$words[$freqComb[$combo][$position]]<>";
        }
        $key .= $combo;

        print DST "$frequencies{$key} ";
    }
    print DST "\n";
}

# having done it all, close all open files... the close of the output file
# is checked because buffered write errors only show up here
close SRC;
close(DST) || die "Couldn't finish writing output file $destination";

# create histogram if necessary
if ( defined $opt_histogram ) { createHistogram(); }

# ... and thats it!
# :o)

#-----------------------------------------------------------------------------
#                       User Defined Function Definitions
#-----------------------------------------------------------------------------

# private helper: given an ngram string ("w1<>w2<>...") return the keys of
# %frequencies that belong to it, one per row of @freqComb. a key is the
# selected words, each followed by "<>", followed by the row number.
sub _freqKeysFor
{
    my $ngramString = shift;
    my @words = split /<>/, $ngramString;

    my @keys = ();
    foreach my $combo ( 0 .. $combIndex - 1 )
    {
        my $key = "";
        foreach my $position ( 1 .. $freqComb[$combo][0] )
        {
            $key .= "$words[$freqComb[$combo][$position]]<>";
        }
        push @keys, $key . $combo;
    }
    return @keys;
}

# function to process tokens
#
# takes one token and updates the globals %ngramFreq, %frequencies and
# $ngramTotal with every ngram that ENDS in this token. reads $ngram,
# $windowSize, @permutations; keeps its state in @window and $windex.
sub processToken
{
    my $token = shift;

    # this is the case when ngram = 1: no window needed
    if ( $ngram <= 1 )
    {
        $ngramFreq{ $token . "<>" }++;
        $frequencies{ $token . "<>0" }++;
        $ngramTotal++;
        return;
    }

    # first put the word into the window array!
    $window[$windex] = $token;

    # until we have enough to make our first ngram, just keep going!
    if ( $windex < $ngram - 1 )
    {
        $windex++;
        return;
    }

    # otherwise, create the ngrams! our method here will be to create all
    # possible ngrams that END with the token that's just come in. thus we
    # shall avoid the pitfall of creating the same ngram twice (a possibility
    # when windowSize > ngram).

    # @permutations is a flat list: each group of ($ngram - 1) entries is one
    # choice of window positions for the leading words
    my $permutationsIndex = 0;
    while ( $permutationsIndex <= $#permutations )
    {
        my $ngramString = "";
        my $okFlag = 1;

        # always consume the whole group, even once it is known to be unusable
        for ( my $i = 0; $i < $ngram - 1; $i++ )
        {
            if ( $permutations[$permutationsIndex] < $windex )
            {
                $ngramString .= $window[ $permutations[$permutationsIndex] ] . "<>";
            }
            else
            {
                # position not before the new token in the (partly filled)
                # window: this group cannot be used
                $okFlag = 0;
            }
            $permutationsIndex++;
        }

        next if ( !$okFlag );

        $ngramString .= "$window[$windex]<>";

        # that is our ngram then!

        # increment the ngramTotal
        $ngramTotal++;

        # and the ngram freq hash. our output ngrams are going to
        # be sorted on this hash.
        $ngramFreq{$ngramString}++;

        # now increment the various frequencies according to the
        # @freqComb array...
        $frequencies{$_}++ foreach ( _freqKeysFor($ngramString) );
    }

    # having dealt with all the new ngrams in this window,
    # increment the windex, if less than the size, or shift out
    # the first element of the array to make place for the next
    # word thats coming in!
    if ( $windex < $windowSize - 1 ) { $windex++; }
    else                             { shift @window; }
}

# function to remove an ngram and adjust the various frequency counts
# appropriately
sub removeNgram
{
    my $ngramString = shift;
    my $count = $ngramFreq{$ngramString};

    # first reduce the ngram total by the frequency of this ngram
    $ngramTotal -= $count;

    # and reduce each combination frequency by the freq of this ngram,
    # dropping entries that reach zero
    foreach my $key ( _freqKeysFor($ngramString) )
    {
        $frequencies{$key} -= $count;
        if ( $frequencies{$key} <= 0 )
        {
            delete $frequencies{$key};
        }
    }

    # finally remove this ngram!
    delete $ngramFreq{$ngramString};
}

# function to create a histogram given the ngramFreq hash of frequencies.
# writes to the file named by $opt_histogram, asking before overwriting.
sub createHistogram
{
    # check if output histogram file already exists
    if ( -e $opt_histogram )
    {
        print "File $opt_histogram exists! Overwrite (Y/N)? ";
        my $reply = <STDIN>;
        $reply = "" unless defined $reply;    # EOF on STDIN counts as "no"
        chomp $reply;
        $reply = uc $reply;
        return if ( $reply ne "Y" );
    }

    # having ascertained that we may open the histogram file for output, lets
    # do so...
    open( my $hst, '>', $opt_histogram ) || die "Couldn't open $opt_histogram";

    # now to construct the histogram hash: frequency => number of distinct
    # ngrams that have that frequency
    my %histogram = ();
    $histogram{ $ngramFreq{$_} }++ foreach ( keys %ngramFreq );

    # having done that, lets print out to the histogram file...
    print $hst "Total ngrams = $ngramTotal\n";
    foreach my $freq ( sort { $a <=> $b } keys %histogram )
    {
        # the loop body only runs when %ngramFreq is non-empty
        printf $hst "Number of n-grams that occurred %3d time(s) = %5d (%.2f percent)\n",
            $freq, $histogram{$freq},
            ( $histogram{$freq} * $freq * 100 ) / $ngramTotal;
    }
    close $hst;
}

# Function &getSourceFiles: function to take the command tail and
# fill @sourceFiles (and $index, the number of files found) with the text
# files to be used to count! while going thru the command line do the
# following processing:
#
# 1> if the string is a text file and can be opened, add it to the array.
# 2> if the string is a directory name, find all text files in that directory,
#    and append to array.
# 3> if the -r (recursive) option is set, go into all subdirectories of that
#    directory too, to do the above!
sub getSourceFiles
{
    $index = 0;

    # every argument is examined; an empty or "0" argument no longer ends
    # the scan early
    foreach my $candidate (@_)
    {
        if ( !( -e $candidate ) )
        {
            # file doesn't exist... ignore!
            if ( defined $opt_verbose )
            {
                print "File $candidate does not exist!\n";
            }
            next;
        }

        if ( !( -r $candidate ) )
        {
            # file can't be read... ignore!
            if ( defined $opt_verbose )
            {
                print "File $candidate cant be read!\n";
            }
            next;
        }

        if ( -d $candidate )
        {
            # this is a directory, go and search this directory for text files
            directorySearch($candidate);
            next;
        }

        if ( !( -T $candidate ) )
        {
            # file is not a text file... ignore!
            if ( defined $opt_verbose )
            {
                print "$candidate is not a text file!\n";
            }
            next;
        }

        $sourceFiles[$index] = $candidate;
        $index++;
    }
}

# function to (possibly recursively) search inside the given directory for
# text files. entries whose name starts with a dot are skipped.
sub directorySearch
{
    my $directory = $_[0];

    # a directory that cannot be opened is reported and skipped
    my $dirHandle;
    if ( !opendir( $dirHandle, $directory ) )
    {
        print STDERR "Couldnt open directory $directory!\n";
        return;
    }
    my @files = map { "$directory/$_" } grep { !/^\./ } readdir $dirHandle;
    closedir $dirHandle;

    foreach my $file (@files)
    {
        if ( ( -d $file ) && ( defined $opt_recurse ) )
        {
            directorySearch($file);
        }

        if ( -T $file )
        {
            $sourceFiles[$index] = $file;
            $index++;
        }
    }
}

# function that takes two numbers and creates a (global) array of
# numbers thusly: given the nos 5, 3 it creates the following array:
# 0 1 2 0 1 3 0 1 4 0 2 3 0 2 4 0 3 4 1 2 3 1 2 4 1 3 4 2 3 4
# to be used to create all possible n grams within a given window
# to generate above list, call function thusly: getPermutations(5,3,0).
# 0 is mandatory to get the recursion started.
# generated list is appended to the global array @permutations; the global
# @tempArray holds the prefix chosen so far during the recursion.
sub getPermutations
{
    my $totalLength = shift;
    my $lengthReqd  = shift;
    my $level       = shift;

    # a full selection has been made: append it to the result
    if ( $level == $lengthReqd )
    {
        push @permutations, $tempArray[$_] foreach ( 0 .. $lengthReqd - 1 );
        return;
    }

    # values are strictly increasing, and enough room is left for the
    # levels still to be filled
    my $start = ( $level == 0 ) ? 0 : $tempArray[ $level - 1 ] + 1;
    my $stop  = $totalLength - $lengthReqd + $level;

    for ( my $i = $start; $i <= $stop; $i++ )
    {
        $tempArray[$level] = $i;
        getPermutations( $totalLength, $lengthReqd, $level + 1 );
    }
}

# function to create the default frequency combinations to be computed
# and output. fills the globals @freqComb and $combIndex.
sub getDefaultFreqCombos
{
    # first create the first index of the comb, that is the
    # combination that includes all the positions of the ngram
    $combIndex = 0;
    $freqComb[0][0] = $ngram;
    foreach my $position ( 0 .. $ngram - 1 )
    {
        $freqComb[0][ $position + 1 ] = $position;
    }
    $combIndex++;

    # now create the rest, starting with size 1
    foreach my $size ( 1 .. $ngram - 1 )
    {
        createCombination( 0, $size );
    }
}

# function to read in the user requested frequency combinations.
# each line of the file is one combination: white space separated word
# positions, each of which must be an integer from 0 to $ngram-1.
# prints a message and exits on an illegal position.
sub readFreqCombo
{
    my $sourceFile = shift;

    # open the source file
    open( my $freqSrc, '<', $sourceFile ) || die("Couldnt open $sourceFile\n");

    # read in the freq combo's one by one into the @freqComb array
    $combIndex = 0;
    while ( <$freqSrc> )
    {
        s/^\s*//;
        s/\s*$//;
        my @positions = split(/\s+/);

        # first how many words make up this combination
        $freqComb[$combIndex][0] = scalar @positions;

        # next the indices of the words.
        foreach my $column ( 1 .. scalar @positions )
        {
            my $position = $positions[ $column - 1 ];

            # check! anything that is not a plain non-negative integer
            # below $ngram cannot name a word of the ngram
            if ( $position !~ /^\d+$/ || $position >= $ngram )
            {
                printf STDERR ("Illegal index value at row %d column %d in file %s\n",
                               $combIndex + 1, $column, $sourceFile);
                exit;
            }
            $freqComb[$combIndex][$column] = $position;
        }

        $combIndex++;
    }
    close $freqSrc;
}

# recursive helper of getDefaultFreqCombos: appends to @freqComb every
# increasing selection of $size positions out of 0 .. $ngram-1. the global
# @tempCombArray holds the positions chosen so far.
sub createCombination
{
    my $level = shift;
    my $size  = shift;

    if ( $level == $size )
    {
        $freqComb[$combIndex][0] = $size;
        foreach my $column ( 1 .. $size )
        {
            $freqComb[$combIndex][$column] = $tempCombArray[ $column - 1 ];
        }
        $combIndex++;
        return;
    }

    my $loopStart = ( !$level ) ? 0 : $tempCombArray[ $level - 1 ] + 1;
    for ( my $i = $loopStart; $i < $ngram; $i++ )
    {
        $tempCombArray[$level] = $i;
        createCombination( $level + 1, $size );
    }
}

# function to output a minimal usage note when the user has not provided any
# commandline options
sub minimalUsageNotes
{
    print STDERR "Usage: count.pl [OPTIONS] DESTINATION SOURCE [[, SOURCE] ...]\n";
    askHelp();
}

# function to output help messages for this program
# NOTE(review): the option descriptions look like they were once column
# aligned; the strings are kept exactly as found - confirm against a
# pristine copy before re-aligning.
sub showHelp
{
    print "Usage: count.pl [OPTIONS] DESTINATION SOURCE [[, SOURCE] ...]\n\n";

    print "Counts up the frequency of all n-grams occurring in SOURCE.\n";
    print "Sends to DESTINATION the list of n-grams found, along with the\n";
    print "frequencies of combinations of the n tokens that the n-gram is\n";
    print "composed of. If SOURCE is a directory, all text files in it are\n";
    print "counted.\n\n";

    print "OPTIONS:\n\n";

    print " --ngram N Creates n-grams of N tokens each. N = 2 by\n";
    print " default.\n\n";

    print " --window N Sets window size to N. Defaults to n-gram\n";
    print " size above.\n\n";

    print " --token FILE Uses regular expressions in FILE to create\n";
    print " tokens. By default two regular expressions\n";
    print " are provided (see README).\n\n";

    print " --nontoken FILE Removes all characters sequences that match\n";
    print " Perl regular expressions specified in FILE.\n\n";

    print " --set_freq_combo FILE \n";
    print " Uses the frequency combinations in FILE to\n";
    print " decide which combinations of tokens to\n";
    print " count in a given n-gram. By default, all\n";
    print " combinations are counted.\n\n";

    print " --get_freq_combo FILE \n";
    print " Prints out the frequency combinations used\n";
    print " to FILE. If frequency combinations have been\n";
    print " provided through --set_freq_combo switch above\n";
    print " these are output; otherwise the default\n";
    print " combinations being used are output.\n\n";

    print " --stop FILE Removes n-grams containing at least one (in\n";
    print " OR mode) or all stop words (in AND mode).\n";
    print " Stop words should be declared as Perl Regular\n";
    print " expressions in FILE.\n\n";

    print " --frequency N Does not display n-grams that occur less\n";
    print " than N times.\n\n";

    print " --remove N Ignores n-grams that occur less than N\n";
    print " times. Ignored n-grams are not counted and\n";
    print " so do not affect counts and frequencies.\n\n";

    print " --newLine Prevents n-grams from spanning across the\n";
    print " new-line character.\n\n";

    print " --histogram FILE Outputs histogram to FILE. Tabulates how\n";
    print " many times n-grams of a given frequency\n";
    print " have occurred.\n\n";

    print " --recurse If SOURCE is a directory, uses all files\n";
    print " in SOURCE as well as all subdirectories of\n";
    print " SOURCE recursively as input.\n\n";

    print " --extended Outputs values of the above switches, if\n";
    print " default values are not used.\n\n";

    print " --verbose Outputs to stderr information about\n";
    print " current program status.\n\n";

    print " --version Prints the version number.\n\n";

    print " --help Prints this help message.\n\n";
}

# function to output the version number
sub showVersion
{
    print STDERR "count.pl - version 0.57\n";
    print STDERR "Copyright (C) 2000-2003, Ted Pedersen & Satanjeev Banerjee\n";
    print STDERR "Date of Last Update 07/01/03\n";
}

# function to output "ask for help" message when the user's goofed up!
# tell the user (on STDERR) how to reach the built-in help
sub askHelp
{
    print STDERR "Type count.pl --help for help.\n";
}

# debugging aid: dump the first $combIndex rows of @freqComb to STDERR,
# one row per line, as "<size>: <index> <index> ... "
sub showFreqCombArray
{
    foreach my $row ( 0 .. $combIndex - 1 )
    {
        # element 0 of a row is the number of indices that follow it
        print STDERR "$freqComb[$row][0]: ";

        foreach my $column ( 1 .. $freqComb[$row][0] )
        {
            print STDERR "$freqComb[$row][$column] ";
        }

        print STDERR "\n";
    }
}