#!/usr/bin/perl -w # # dbcolhisto # Copyright (C) 1997-1998 by John Heidemann # $Id: dbcolhisto,v 1.21 2003/05/23 04:15:41 johnh Exp $ # # This program is distributed under terms of the GNU general # public license, version 2. See the file COPYING # in $dblibdir for details. # sub usage { print STDERR <max" buckets are created. By default, the last bucket includes max (and is thus infintimessally larger than the other buckets). This irregularity can be removed with the -I option. Sample input: #h name id test1 a 1 80 b 2 70 c 3 65 d 4 90 e 5 70 f 6 90 Sample command: cat DATA/grades.jdb | dbcolhisto -s 0 -e 100 -n 10 test1 Sample output: #h low histogram 0 0 10 0 20 0 30 0 40 0 50 0 60 1 70 2 80 1 90 2 # | ./dbcolhisto -s 0 -e 100 -n 10 test1 Related programs: dbcolpercentile This program requires constant memory and O(size of data) temporary disk space. END exit 1; } BEGIN { $dblibdir = "/home/johnh/BIN/DB"; push(@INC, $dblibdir); } use DbGetopt; require "$dblibdir/dblib.pl"; my(@orig_argv) = @ARGV; my($prog) = &progname; my($bucket_width, $bucket_start, $bucket_end, $bucket_count, $graphical_output); my($last_inclusive) = 1; my($bogus_are_ignored) = 1; my($dbopts) = new DbGetopt("ae:gIn:s:w:?", \@ARGV); my($ch); while ($dbopts->getopt) { $ch = $dbopts->opt; if ($ch eq 'e') { $bucket_end = $dbopts->optarg; } elsif ($ch eq 'n') { $bucket_count = $dbopts->optarg; } elsif ($ch eq 'a') { $bogus_are_ignored = 0; } elsif ($ch eq 's') { $bucket_start = $dbopts->optarg; } elsif ($ch eq 'w') { $bucket_width = $dbopts->optarg; } elsif ($ch eq 'g') { $graphical_output = 1; } elsif ($ch eq 'I') { $last_inclusive = 0; } else { &usage; }; }; &usage if ($#ARGV != 0); &readprocess_header; my($xfcol) = $ARGV[0]; my($xf) = $colnametonum{$xfcol}; die ("$prog: unknown column name ``$xfcol''.\n") if (!defined($xf)); my($min, $max); my($n) = 0; my($save_data_filename); $save_data_filename = db_tmpfile(SAVE_DATA); print SAVE_DATA "$col_headertag data\n"; while () { &delayed_pass_comments && next; &split_cols; $x = &force_numeric($f[$xf], $bogus_are_ignored); next if (!defined($x)); print SAVE_DATA "$x\n"; # sigh, back to a string $min = $x if (!defined($min) || $x < $min); $max = $x if (!defined($max) || $x > $max); $n++; }; if ($n == 0) { print STDERR "$prog: histogram impossible with no input\n"; exit 1; } elsif ($n == 1) { print STDERR "$prog: histogram impossible with singleton input\n"; exit 1; }; # # Figure out bucket parameters. # Yuck. Constraint solving in Perl. # my($n_defined) = (defined($bucket_start) ? 1 : 0) + (defined($bucket_end) ? 1 : 0) + (defined($bucket_width) ? 1 : 0) + (defined($bucket_count) ? 1 : 0); if ($n_defined >= 4) { die "$prog: parameters over-specified.\n"; } elsif ($n_defined == 3) { # fall through, clean up handles it. } elsif ($n_defined == 2) { if (defined($bucket_start) && defined($bucket_end)) { $bucket_count = 10; } elsif (defined($bucket_start) && defined($bucket_width)) { $bucket_count = 10; } elsif (defined($bucket_start) && defined($bucket_count)) { $bucket_end = $max; } elsif (defined($bucket_end) && defined($bucket_width)) { $bucket_count = 10; } elsif (defined($bucket_end) && defined($bucket_count)) { $bucket_start = $min; } elsif (defined($bucket_width) && defined($bucket_count)) { my($mid) = ($max - $min) / 2 + $min; $bucket_start = $mid - $bucket_width * $bucket_count / 2; }; # Figure the rest out below. } elsif ($n_defined == 1) { if (defined($bucket_start)) { $bucket_end = $max; $bucket_count = 10; } elsif (defined($bucket_end)) { $bucket_start = $min; $bucket_count = 10; } elsif (defined($bucket_width) || defined($bucket_count)) { $bucket_start = $min; $bucket_end = $max; }; } elsif ($n_defined < 1) { $bucket_start = $min; $bucket_end = $max; $bucket_count = 10; }; # clean up $bucket_start = $bucket_end - $bucket_width * $bucket_count if (!defined($bucket_start)); $bucket_end = $bucket_start + $bucket_width * $bucket_count if (!defined($bucket_end)); $bucket_width = ($bucket_end - $bucket_start) / $bucket_count if (!defined($bucket_width)); $bucket_count = ($bucket_end - $bucket_start) / $bucket_width if (!defined($bucket_count)); $bucket_width += 0.0; # # Compute the histogram. # my(@buckets) = (0) x $bucket_count; my($low_bucket, $high_bucket) = (0, 0); close SAVE_DATA; open (SAVE_DATA, "<$save_data_filename") || die "$prog: cannot open $save_data_filename\n"; my($header) = scalar(); while () { chomp; $_ += 0.0; my($b) = ($_ - $bucket_start) / ($bucket_width); if ($b < 0) { $low_bucket++; } elsif ($b >= $bucket_count) { if (($_ == $high_bucket || $b == $bucket_count) && $last_inclusive) { $buckets[$bucket_count]++; } else { $high_bucket++; }; } else { $buckets[int($b)]++; }; } close SAVE_DATA; # # Display the histogram # sub format { my($n) = @_; return ($graphical_output ? ("*" x ($n)) : $n); } &write_header(qw(low histogram)); my($break) = $bucket_start; print "<$break\t" . &format($low_bucket) . "\n" if ($low_bucket); for ($i = 0; $i <= $#buckets; $i++) { &write_these_cols($break, &format($buckets[$i])); $break += $bucket_width; }; &write_these_cols(">=$break\t", &format($high_bucket)) if ($high_bucket); &delayed_flush_comments; print "# | $prog ", join(" ", @orig_argv), "\n"; exit 0; if (0) { my $x; $x = $col_headertag = $f[0] = $colnametonum{'foo'}; }