#!/usr/bin/perl # # dbcolscorrelate # Copyright (C) 1998-2001 by John Heidemann # $Id: dbcolscorrelate,v 1.7 2003/05/23 04:15:44 johnh Exp $ # # This program is distributed under terms of the GNU general # public license, version 2. See the file COPYING # in $dblibdir for details. # sub usage { print STDERR <getopt) { $ch = $dbopts->opt; if ($ch eq 'a') { $bogus_are_ignored = 0; } else { &usage; }; }; &usage if ($#ARGV < 1); my(@xfcols) = @ARGV; my(@xfs); # # Shunt the data to a separate file. # my($tmp) = db_tmpfile(TMP); my($header_line) = 1; while () { if ($header_line) { &process_header($_); my($xfcol); foreach $xfcol (@xfcols) { die ("$prog: unknown column ``$xfcol''.\n") if (!defined($colnametonum{$xfcol})); push(@xfs, $colnametonum{$xfcol}); }; $header_line = undef; }; print TMP $_; }; close TMP; close STDIN; # # Figure out stats on each column. # my($stats) = []; my($i, $j); foreach $i (0..$#xfcols) { my($xfcol) = $xfcols[$i]; my($xf) = $xfs[$i]; # NEEDSWORK: should pass $bogus_are_ignored open(FROMSTATS, "$dbbindir/dbstats $xfcol <$tmp |") || die "$prog: cannot run dbstats\n"; my($header) = scalar(); $header =~ /^$headertag_regexp/ or die "$prog: dbstats returns bogus header.\n"; my(@statsnames) = split(/$header_fsre/, $header); shift @statsnames; shift @statsnames while ($statsnames[0] =~ /^\-/); my($stats_line) = scalar(); my(@stats) = split(/$fsre/, $stats_line); foreach (0..$#stats) { $stats->[$i]{$statsnames[$_]} = $stats[$_]; }; close FROMSTATS; }; # reopen the file for our work. open(STDIN, "<$tmp") || die "$prog: cannot reopen $tmp.\n"; &readprocess_header; # # Figure correlation. # my($sum_zs) = []; my($n) = []; my($names) = []; foreach $i (0..$#xfs) { foreach $j (0..$#xfs) { next if ($i >= $j); $sum_zs->[$i][$j] = 0; $n->[$i][$j] = 0; $names->[$i][$j] = ($#xfs == 1) ? "correlation" : "$xfcols[$i]_$xfcols[$j]"; }; }; while () { &delayed_pass_comments() && next; &split_cols; # figure z scores my(@xs, @zs); foreach $i (0..$#xfs) { my($x) = &force_numeric($f[$xfs[$i]], $bogus_are_ignored); push(@xs, $x); my($z) = defined($x) ? ($x - $stats->[$i]{'mean'}) / $stats->[$i]{'stddev'} : undef; push(@zs, $z); }; # figure correlation sums foreach $i (0..$#xfs) { foreach $j (0..$#xfs) { next if ($i >= $j); next if (!defined($zs[$i]) || !defined($zs[$j])); $sum_zs->[$i][$j] += $zs[$i] * $zs[$j]; ($n->[$i][$j])++; }; }; }; # # Output the results. # my(@names); my(@correlations); foreach $i (0..$#xfs) { foreach $j (0..$#xfs) { next if ($i >= $j); push(@names, $names->[$i][$j]); push(@correlations, $n->[$i][$j] == 0 ? "-" : sprintf("$default_format", $sum_zs->[$i][$j] / $n->[$i][$j])); }; }; &write_header(@names); &write_these_cols(@correlations); &delayed_flush_comments(); print "# | $prog ", join(" ", @orig_argv), "\n"; exit 0; # error supression { my($dummy) = $default_format; }