#!/usr/bin/perl

#
# dbcoldiff
#
# Copyright (C) 1991-1998 by John Heidemann <johnh@isi.edu>
# $Id: dbcoldiff,v 1.20 2006/08/25 05:29:42 johnh Exp $
#
# This program is distributed under terms of the GNU general
# public license, version 2.  See the file COPYING
# in $dblibdir for details.
#
sub usage {
	print <<END;
usage: $0 [-f format] [-c ConfRating] [-h HypothesizedDifference] m1c ss1c n1c m2c ss2c n2c
usage: $0 [-f format] [-c ConfRating] m1c n1c m2c n2c
    	Produce statistics on the difference of sets of random variables.

	Random variables are specified by:
	m1c, m2c --- column numbers of means of random variables
	ss1c, ss2c --- column numbers of standard deviations of random variables
	n1c, n2c --- counts of number of samples for each random variable

	(If you need to compute these values, use dbstats.)

Outputs up to ten new columns:
	diff --- difference of RV 2 - RV 1
	diff_pct --- percentage difference (2-1)/1
	diff_conf_{half,low,high}
	diff_conf_pct_{half,low,high}
	    --- half confidence intervals and low and high values
	    	for absolute and relative confidence
        t_test --- T-test value for the given hypothesized difference
	t_test_result --- given the confidence rating, does the test pass
	t_test_break --- the hypothesised value that is break-even point
		for the T-test.
	t_test_break_pct --- break-even point as a percent of m1c.

Confidence intervals are not printed if standard deviations are not provided.
Confidence intervals assume normal distributions with common variances.

T-tests are only computed if a hypothesized difference is provided.
Hypothesized diferences should be proceeded by <=, >=, =.
T-tests assume normal distributions with common variances.

Sample input (example 6.12 from Scheaffer and McClave):
#h title mean2 stddev2 n2 mean1 stddev1 n1
example6.12 0.17 0.0020 5 0.22 0.0010 4

Sample command:
cat data.jdb | dbcoldiff mean2 stddev2 n2 mean1 stddev1 n1

Sample output:
#h title mean2 stddev2 n2 mean1 stddev1 n1 diff diff_pct diff_conf_half diff_conf_low diff_conf_high diff_conf_pct_half diff_conf_pct_low diff_conf_pct_high
example6.12	0.17	0.0020	5	0.22	0.0010	4	0.05	29.412	0.0026138	0.047386	0.052614	1.5375	27.874	30.949
#  | dbcoldiff mean2 stddev2 n2 mean1 stddev1 n1

Sample input (example 7.10 from Scheaffer and McClave):
#h title x2 ss2 n2 x1 ss1 n1
example7.10 9 35.22 24.44 9 31.56 20.03

Run:
dbcoldiff -h '<=0' x2 ss2 n2 x1 ss1 n1

Output:
#h title n1 x1 ss1 n2 x2 ss2 diff diff_pct diff_conf_half diff_conf_low diff_conf_high diff_conf_pct_half diff_conf_pct_low diff_conf_pct_high t_test t_test_result
example7.10 9 35.22 24.44 9 31.56 20.03 3.66 0.11597 4.7125 -1.0525 8.3725 0.14932 -0.033348 0.26529 1.6465 not-rejected
#  | /global/us/edu/ucla/cs/ficus/users/johnh/BIN/DB/dbcoldiff -h <=0 x2 ss2 n2 x1 ss1 n1

END
    exit 1;
}


BEGIN {
    $dblibdir = "/home/johnh/BIN/DB";
    push(@INC, $dblibdir);
}
use DbGetopt;
use DbTDistr;
require "$dblibdir/dblib.pl";

$conf_pct = 0.95;
$format = "%.5g";
$hyp_diff = undef;
$hyp_class = undef;
@orig_argv = @ARGV;
my($prog) = &progname;
my($dbopts) = new DbGetopt("c:h:f:?", \@ARGV);
my($ch);
while ($dbopts->getopt) {
    $ch = $dbopts->opt;
    if ($ch eq 'c') {
	$conf_pct = $dbopts->optarg;
     } elsif ($ch eq 'h') {
     	die ("$prog: bad hypothesis difference " . $dbopts->optarg . ".\n")
	    unless ($dbopts->optarg =~ /^\s*([<>=!]+)\s*([0-9\.]+)/);
	($hyp_class, $hyp_diff) = ($1, $2);
     } elsif ($ch eq 'f') {
	$format = $dbopts->optarg;
    } else {
	&usage;
    };
};

if ($hyp_class =~ /\<\=/) {
    $hyp_class = -1;
} elsif ($hyp_class =~ /\>\=/) {
    $hyp_class = 1;
} elsif ($hyp_class =~ /==?/) {
    $hyp_class = 0;
} elsif ($hyp_class ne undef) {
    die("$prog: bad hypothesis direction (must be <=, >=, or =, was $hyp_class).\n");
};

($STATS_DIFF, $STATS_CONF, $STATS_T_TEST) = (0..10);
# rest of args
if ($#ARGV == 5) {
    ($m1c, $ss1c, $n1c, $m2c, $ss2c, $n2c) = @ARGV;
    $stats_kind = ($hyp_diff eq undef) ? $STATS_CONF : $STATS_T_TEST;
} elsif ($#ARGV == 3) {
    ($m1c, $n1c, $m2c, $n2c) = @ARGV;
    $stats_kind = $STATS_DIFF;
    die("$prog: T-tests require standard deviations.\n") if ($hyp_diff ne undef);
} else {
    &usage;
};

# computed off args
my $conf_alpha = (1.0 - $conf_pct) / 2.0;


&readprocess_header;
foreach (@ARGV) {
    die ("$prog: unknown column ``$_''.\n")
        if (!defined($colnametonum{$_}));
};
($m1c, $ss1c, $n1c, $m2c, $ss2c, $n2c) =
    ($colnametonum{$m1c}, $colnametonum{$ss1c}, $colnametonum{$n1c}, $colnametonum{$m2c}, $colnametonum{$ss2c}, $colnametonum{$n2c});

#
# new columns
#
$diff_f = &col_create("diff");
$pctdiff_f = &col_create("diff_pct");
if ($stats_kind >= $STATS_CONF) {
    $confdelta_f = &col_create("diff_conf_half");
    $conflow_f = &col_create("diff_conf_low");
    $confhigh_f = &col_create("diff_conf_high");
    $pctconfdelta_f = &col_create("diff_conf_pct_half");
    $pctconflow_f = &col_create("diff_conf_pct_low");
    $pctconfhigh_f = &col_create("diff_conf_pct_high");
};
if ($stats_kind >= $STATS_T_TEST) {
    $t_test_f = &col_create("t_test");
    $t_test_result_f = &col_create("t_test_result");
    $t_test_break_f = &col_create("t_test_break");
    $t_test_break_pct_f = &col_create("t_test_break_pct");
};
&write_header();


while (<STDIN>) {
    &pass_comments && next;
    &split_cols;

    $diff = $f[$m2c] - $f[$m1c];
    if ($f[$m1c] == 0.0) {
	$pctdiff = "n/a";
    } else {
	$pctdiff = $diff / $f[$m1c] * 100.0;
    };
    $confdelta = $conflow = $confhigh = 
    	$pctconfdelta = $pctconflow = $pctconfhigh = 
	$t_test = $t_test_result =
	$t_test_break = $t_test_break_pct = 
    			"n/a";
    if ($stats_kind >= $STATS_CONF && $f[$n1c] > 1 && $f[$n2c] > 1) {
    	# basic stuff
	$degrees_of_freedom = ($f[$n1c] + $f[$n2c] - 2);
	$ssp = (($f[$n1c] - 1) * $f[$ss1c] * $f[$ss1c] + ($f[$n2c] - 1) * $f[$ss2c] * $f[$ss2c]) / $degrees_of_freedom;
	$sqrt_ssp_inverses = sqrt($ssp * (1/$f[$n1c] + 1/$f[$n2c]));
	$t_value = &DbTDistr::t_distribution($degrees_of_freedom, $conf_alpha);

	# confidence intervals
	$confdelta = $t_value * $sqrt_ssp_inverses;
	$conflow = $diff - $confdelta;
	$confhigh = $diff + $confdelta;

	# t-test
	if ($stats_kind >= $STATS_T_DIST && $sqrt_ssp_inverses != 0.0) {
	    $t_test = ($diff - $hyp_diff) / $sqrt_ssp_inverses;
	    $t_test_result = "not-rejected";
	    if ($hyp_class < 0) {
	    	$t_test_result = "rejected"
		    if ($t_test > $t_value);
	    } elsif ($hyp_class > 0) {
	    	$t_test_result = "rejected"
		    if ($t_test < $t_value);
	    } else {
	    	$t_test_result = "rejected"
		    if (&abs($t_test) > $t_value);
	    };
	    # also compute the break-even point
	    $t_test_break = $diff - $t_value * $sqrt_ssp_inverses;
	    $t_test_break_pct = $t_test_break / $f[$m1c] * 100.0
	    	if ($f[$m1c] != 0.0);
	};

	# percentages
    	if ($f[$m1c] != 0.0) {
            $pctconfdelta = $confdelta / $f[$m1c] * 100.0;
            $pctconflow = $conflow / $f[$m1c] * 100.0;
            $pctconfhigh = $confhigh / $f[$m1c] * 100.0;
    	};
    };


    $f[$diff_f] = sprintf("$format", $diff);
    $f[$pctdiff_f] = sprintf("$format", $pctdiff);
    if ($stats_kind >= $STATS_CONF) {
        $f[$confdelta_f] = sprintf("$format", $confdelta);
	$f[$conflow_f] = sprintf("$format", $conflow);
	$f[$confhigh_f] = sprintf("$format", $confhigh);
	$f[$pctconfdelta_f] = sprintf("$format", $pctconfdelta);
	$f[$pctconflow_f] = sprintf("$format", $pctconflow);
	$f[$pctconfhigh_f] = sprintf("$format", $pctconfhigh);
    };
    if ($stats_kind >= $STATS_T_TEST) {
    	$f[$t_test_f] = sprintf("$format", $t_test);
	$f[$t_test_result_f] = $t_test_result;
    	$f[$t_test_break_f] = $t_test_break;
    	$f[$t_test_break_pct_f] = sprintf("$format", $t_test_break_pct);
    };

    &write_cols;
};

print "#  | $prog " . join(" ", @orig_argv) . "\n";
exit 0;