#!/usr/bin/perl -w # # dbstats # Copyright (C) 1991-2000 by John Heidemann # $Id: dbstats,v 1.42 2006/08/25 05:11:24 johnh Exp $ # # This program is distributed under terms of the GNU general # public license, version 2. See the file COPYING # in $dblibdir for details. # sub usage { print STDERR <getopt) { $ch = $dbopts->opt; if ($ch eq 'c') { $conf_pct = $dbopts->optarg; } elsif ($ch eq 'f') { $format = $dbopts->optarg; } elsif ($ch eq 'a') { $bogus_are_ignored = 0; } elsif ($ch eq 'm') { $do_median = 1; $save_data = 1; } elsif ($ch eq 'S') { $sorting_required = 0; } elsif ($ch eq 'q') { $ntile = $dbopts->optarg; $save_data = 1; } else { &usage; }; }; &usage if ($#ARGV != 0); my($xfcol) = $ARGV[0]; &readprocess_header; die ("$prog: unknown column ``$xfcol''.\n") if (!defined($colnametonum{$xfcol})); my($xf) = $colnametonum{$xfcol}; my($n) = 0; my($sx) = 0; my($sxx) = 0; my($minmaxinit) = 0; my($save_data_filename); if ($save_data) { $save_data_filename = db_tmpfile(TMP); close TMP; my($sort_command) = ($sorting_required ? "|$dbbindir/dbsort -n data " : ""); open(SAVE_DATA, "$sort_command >$save_data_filename") || die "$prog: cannot run dbsort.\n"; print SAVE_DATA "$col_headertag data\n"; } # # Read and process the data. # while () { &delayed_pass_comments() && next; &split_cols; $x = &force_numeric($f[$xf], $bogus_are_ignored); next if (!defined($x)); $n++; $sx += $x; $sxx += $x * $x; print SAVE_DATA "$x\n" if ($save_data); if (!$minmaxinit) { $min = $max = $x; $minmaxinit = 1; } else { $min = $x if ($x < $min); $max = $x if ($x > $max); }; }; if ($n == 0) { die "$0: no input\n"; exit 1; }; # # Compute stats. # $mean = $sx / $n; # stddev = s, not s^2, approximates omega # Check for special cases: # $n <= 1 => divide by zero # all same data value => can sometimes get very small or negative # stddev (due to rounding error) # for these cases, $stddev = 0 $stddev = ($n <= 1 || $max == $min) ? 0 : sqrt(($sxx - $n * $mean * $mean) / ($n - 1)); if ($mean == 0) { $pct_rsd = "na"; } else { $pct_rsd = ($stddev / $mean) * 100; }; # # Confidence intervals from "Probability and Statistics for Engineers", # Second Edition, 1986, Scheaffer and McClave, p. 242. # if ($n <= 1) { $conf_half = 0; } else { my $conf_alpha = (1.0 - $conf_pct) / 2.0; $conf_half = &DbTDistr::t_distribution($n - 1, $conf_alpha) * $stddev / sqrt($n); }; $conf_low = $mean - $conf_half; $conf_high = $mean + $conf_half; # # Compute median/quartile. # sub round_up { my($x) = @_; my($xi) = int($x); return ($x > $xi) ? $xi+1 : $xi; } my($median, @q); my($real_ntile) = ($ntile ? $ntile : 2); if ($save_data && $n == 0) { $median = $mean; push(@q, ($mean) x $real_ntile); } elsif ($save_data) { close SAVE_DATA; open (SAVE_DATA, "<$save_data_filename") || die "$prog: cannot open $save_data_filename.\n"; # To handle the ugly case of having more ntiles than # data, we detect it and replicate the data until we have more # replicated_data than ntiles. my($replicate_data) = ($n >= $real_ntile+1) ? 1 : round_up(($real_ntile+1.0)/$n); my($replicated_n) = $n * $replicate_data; # Also note that the array of quartiles and the number of # data elements read are both 1-based and not 0-based like # most perl stuff. This is to make the math easier. $median_i = round_up($replicated_n / 2); $ntile_frac = ($replicated_n + 0.0) / ($real_ntile + 0.0); my($x, $last_x, $next_q_i); @q = (0); # note that q is primed with 0 my($replicates_left) = 0; $x = ; # consume header die "$0: internal error." if ($x !~ /^#h/); # my($i); # note that i counts from 1! for ($i = 1; $#q+1 < $real_ntile; $i++) { if (--$replicates_left <= 0) { $x = ; chomp $x; $replicates_left = $replicate_data; # Verify sorted order (in case the user lied to us # about pre-sorting). if (defined($last_x) && $x < $last_x) { my($info) = ($sorting_required ? " (internal error in dbsort)" : " (user specified -S for pre-sorted data but it is unsorted)"); die "$prog: cannot process data that is out of order between $last_x and $x $info.\n"; }; $last_x = $x; }; if ($i == $median_i) { $median = $x; }; $next_q_i = (round_up($ntile_frac * ($#q + 1.0) )) if (!defined($next_q_i)); # print "d: q=$#q nq=$next_q_i i=$i\n"; if ($i == $next_q_i) { push(@q, $x); $next_q_i = undef; }; }; }; # # Output the results. # my(@headers) = (qw(mean stddev pct_rsd conf_range conf_low conf_high conf_pct sum sum_squared min max n)); push(@headers, "median") if ($do_median); if ($ntile) { foreach (1..($ntile-1)) { push(@headers, "q$_"); }; }; &write_header(@headers); my(@o) = (); foreach ($mean, $stddev, $pct_rsd, $conf_half, $conf_low, $conf_high, $conf_pct, $sx, $sxx, $min, $max) { push (@o, ($_ eq 'na' ? $_ : (sprintf "$format", $_))); }; push (@o, $n); push(@o, $median) if ($do_median); push(@o, @q[1..($ntile-1)]) if ($ntile); &write_these_cols(@o); &delayed_flush_comments(); print "# | $prog " . join(" ", @orig_argv) . "\n" . "#\t\t$conf_pct confidence intervals assume normal distribution and small n.\n"; exit 0; # supress warings # error supression { my($dummy) = $f[0]; $dummy = $default_format; $dummy = $col_headertag; $dummy = ; }