#!/usr/bin/perl -w

#
# dbcolmultiscale
# Copyright (C) 2001 by John Heidemann <johnh@isi.edu>
# $Id: dbcolmultiscale,v 1.6 2003/05/23 04:15:43 johnh Exp $
#
# This program is distributed under terms of the GNU general
# public license, version 2.  See the file COPYING
# in $dblibdir for details.
#
# usage: print the command synopsis, a description of the output
# format, and a worked example to STDERR, then exit with status 1.
# Takes no arguments and never returns to the caller.
sub usage {
	print STDERR <<END;
usage: $0 [-dS] [-s SmallestTimescale] TimeField DataField

Dbcolmultiscale computes the sums and rates of a set of samples
taken at times (given by TimeField) with values given by DataField
across multiple timescales.  Timescales start at the value given by
SmallestTimescale (defaulting to 1 unit) and increasing by powers of
two.

The output is a table listing the time of each sample period, the
timescale it sums, the timescale index,
a jittered timescale index, the sum, and the rate (sum
divided by timescale).

More formally, the output is

    t_i s_i si_i si'_i sum_{t_i,s_i} rate_{t_i,s_i}

where t_i is the time that the period begins
s_i is the timescale for this period in units of time
si_i is the timescale index (:= lg(s_i / SmallestTimescale) )
si'_i := si_i + h(t_i - FirstTime)  where h is some hash function to "jitter" the
  scale index, possibly in some regular way (such as mod)
    (where FirstTime is the time of the first event)
sum_{t_i,s_i} := sum_{t_i,s_{i-1}} + sum_{t_i+s_{i-1},s_{i-1}}
rate_{t_i,s_i} := sum_{t_i,s_i} / s_i

The input source must be sorted numerically by TimeField.
By default dbcolmultiscale sorts its input;
it will run more efficiently if given the -S option
to tell it the data is already sorted.

This program consumes O(lg duration) memory,
where duration is defined as the time between the first and last events.
If the data is not pre-sorted, the program requires O(number of records)
disk space.

Options:
    -s LowestTimescale    specify the shortest timescale to consider
				(defaults to 1 unit)
    -S                    data is pre-sorted
    -d			  debugging

Sample input:
#h time size
# This case has bursty data in the first half, and
# smooth data in the second half.
0 0
1 8
8 1
9 1
10 1
11 1
12 1
13 1
14 1
15 1

Sample command:
cat TEST/dbcolmultiscale_bursty.in | dbcolmultiscale -S time size

Sample output:
#h period_start timescale timescale_index jittered_index sum rate
# This case has bursty data in the first half, and
# smooth data in the second half.
0	1	0	-0.15	0	0
1	1	0	-0.14	8	8
0	2	1	0.85	8	4
2	1	0	-0.13	0	0
3	1	0	-0.12	0	0
2	2	1	0.86	0	0
0	4	2	1.85	8	2
4	1	0	-0.11	0	0
5	1	0	-0.1	0	0
4	2	1	0.87	0	0
6	1	0	-0.09	0	0
7	1	0	-0.08	0	0
6	2	1	0.88	0	0
4	4	2	1.86	0	0
0	8	3	2.85	8	1
8	1	0	-0.07	1	1
9	1	0	-0.06	1	1
8	2	1	0.89	2	1
10	1	0	-0.05	1	1
11	1	0	-0.04	1	1
10	2	1	0.9	2	1
8	4	2	1.87	4	1
12	1	0	-0.03	1	1
13	1	0	-0.02	1	1
12	2	1	0.91	2	1
14	1	0	-0.01	1	1
15	1	0	0	1	1
14	2	1	0.92	2	1
12	4	2	1.88	4	1
8	8	3	2.86	8	1
0	16	4	3.85	16	1
#  | dbcolmultiscale -S time size

Post-processing:
To visualize with xgraph:
  ... | dbcol jittered_index rate | xgraph -nl -p
To do stats on the rates:
  ... | dbmultistats -q 4 timescale_index rate 
Or both:
  ... | dbcolmultiscale_to_gnuplot

END
# '
	exit 1;
}

# Installation-specific locations.  They are assigned inside a BEGIN
# block so that @INC is already extended when the compile-time
# `use DbGetopt' statement below is processed.
BEGIN {
    # directory prefix used later to locate the dbsort helper program
    $dbbindir = "/home/johnh/BIN/DB";
    # directory holding dblib.pl; also appended to the module search path
    # (presumably DbGetopt.pm lives here too -- confirm against the install)
    $dblibdir = "/home/johnh/BIN/DB";
    push(@INC, $dblibdir);
}
use DbGetopt;
require "$dblibdir/dblib.pl";

#
# Command-line handling.
#
# The untouched argument list is saved first because it is echoed in
# the trailer comment written at the very end of the output.
#
@orig_argv = @ARGV;
my($prog) = &progname;
my($smallest_timescale) = 1;	# -s: duration of the level-0 period
my($debug) = undef;		# -d: debugging flag
my($sorting_required) = 1;	# cleared by -S when input is pre-sorted
my($dbopts) = DbGetopt->new("ds:S?", \@ARGV);
my($ch);
while ($dbopts->getopt) {
    $ch = $dbopts->opt;
    if ($ch eq 's') {
	$smallest_timescale = $dbopts->optarg;
    } elsif ($ch eq 'd') {
	$debug = 1;
    } elsif ($ch eq 'S') {
	$sorting_required = 0;
    } else {
	# -? or anything unrecognized
	&usage;
    };
};

# Reject a timescale that is not a positive decimal number.  Every
# period is advanced by this amount and every rate is divided by it:
# zero (or text that numifies to zero) would die with an illegal
# division by zero, and a negative value would never let the period
# catch up with the data, looping forever.
if (!defined($smallest_timescale)
    || $smallest_timescale !~ /^\+?(?:\d+\.?\d*|\.\d+)(?:[eE][-+]?\d+)?$/
    || $smallest_timescale <= 0) {
    my($shown) = defined($smallest_timescale) ? $smallest_timescale : '';
    die("$prog: smallest timescale must be a positive number, not ``$shown''.\n");
};

# exactly two positional arguments must remain: the two column names
&usage if ($#ARGV != 1);
my($timecol, $datacol) = @ARGV;

# handle sorting, if necessary
#
# dbsort inherits our standard input and we read its output instead.
# The pipe is opened in list form (Perl 5.8 or later) so the column
# name reaches dbsort as a single argument and is never interpreted
# by a shell, whatever characters it contains.
if ($sorting_required) {
    open(SORTED, "-|", "$dbbindir/dbsort", "-n", $timecol) || die("$prog: cannot run dbsort over input: $!\n");
    # from here on, reading STDIN yields the sorted stream
    open(STDIN, "<&SORTED") || die "$0: cannot dup SORTED.\n";
};

# Read the table header, then translate the two column names given on
# the command line into field indexes via %colnametonum.
&readprocess_header;
die ("$prog: unknown column name ``$timecol''.\n") if (!defined($colnametonum{$timecol}));
my($timef) = $colnametonum{$timecol};
die ("$prog: unknown column name ``$datacol''.\n") if (!defined($colnametonum{$datacol}));
my($dataf) = $colnametonum{$datacol};


#
# start the output
#
# emit the header naming the six output columns
&write_header(qw(period_start timescale timescale_index jittered_index sum rate));


# Times of the first and the most recent record seen; both stay
# undefined until the first data line has been read.
my($first_time, $last_time);

# Per-timescale state, indexed by timescale index (0 = smallest).
# Higher levels are filled in as lower levels complete.
my(@period_start) = ();		# start time of the period being accumulated
				# (was "= undef", which makes a one-element
				# list holding undef, not an empty array)
my(@timescale_duration) = ($smallest_timescale);	# length of a period
my(@sum) = (0);			# running total for the current period
my(@other_sum) = (undef);	# total of the completed first half, or
				# undef while no first half is pending
my(@other_period_start) = (undef);	# start time of that first half
my($maxscale_summed_i) = 0;  # counts from zero
				# NOTE(review): not referenced anywhere else
				# in the visible code

# jitter: nudge a timescale index by a small deterministic offset.
#   $index     - timescale index to perturb
#   $time      - start time of the period
#   $timescale - duration of the period
# The number of whole periods elapsed since $first_time is taken
# modulo 30 and mapped to an offset of (slot - 15) hundredths, so the
# offset repeats every 30 periods.
# Returns the index plus that offset.
sub jitter {
    my($scale_index, $start, $duration) = @_;
    my($periods_elapsed) = int(($start - $first_time) / $duration);
    my($slot) = $periods_elapsed % 30;
    my($offset) = ($slot - 15) / 100.0;
    return $scale_index + $offset;
}

# output_timescale: write one output row for the period currently
# held at timescale level $index.
# The row carries, in order: period start, period duration, the
# timescale index, the jittered index, the accumulated sum, and the
# rate (sum divided by duration).
sub output_timescale {
    my($index) = @_;
    my($start) = $period_start[$index];
    my($duration) = $timescale_duration[$index];
    my($total) = $sum[$index];
    my($rate) = $total / $duration;
    &write_these_cols($start, $duration, $index,
	jitter($index, $start, $duration),
	$total, $rate);
};

# complete_timescale: close the current period at level $index.
#
# The period is always written out.  Periods at one level are then
# paired up: the first of a pair is remembered in @other_sum and
# @other_period_start; when the second one closes, the two are added
# into a period of twice the duration at level $index+1, which is in
# turn completed recursively.  Finally this level is reset to an
# empty period starting where the closed one ended.
sub complete_timescale {
    my($index) = @_;

    # whichever half this is, the finished period gets reported
    &output_timescale($index);

    if (defined($other_sum[$index])) {
	# second half of a pair: build the enclosing period one level
	# up and complete it as well
	my($parent) = $index + 1;
	$sum[$parent] = $other_sum[$index] + $sum[$index];
	$timescale_duration[$parent] = 2 * $timescale_duration[$index];
	$period_start[$parent] = $other_period_start[$index];
	&complete_timescale($parent);
	# the pair is consumed; the next period is a first half again
	$other_sum[$index] = undef;
    } else {
	# first half of a pair: stash it until its partner closes
	$other_sum[$index] = $sum[$index];
	$other_period_start[$index] = $period_start[$index];
    };

    # begin a fresh, empty period immediately after the closed one
    $sum[$index] = 0;
    $period_start[$index] += $timescale_duration[$index];
}


# Main loop: consume the input one record at a time, folding each
# value into the level-0 period it belongs to and closing periods
# (which cascades to the larger timescales) as time advances.
while (<STDIN>) {
    next if (&pass_comments());
    &split_cols;

    my($time, $data) = @f[$timef, $dataf];

    # the very first record anchors the first period and both clocks
    unless (defined($first_time)) {
	$first_time = $time;
	$last_time = $time;
	$period_start[0] = $time;
    };

    # time must never run backwards
    if ($time < $last_time) {
	die "$0: input is not sorted by time ($time < $last_time)\n";
    };
    $last_time = $time;

    if ($time - $period_start[0] < $smallest_timescale) {
	# the record still falls inside the open level-0 period
	$sum[0] += $data;
    } else {
	# The record lies beyond the open period: keep closing
	# level-0 periods (empty ones included) until the open
	# period covers this record's time.
	while ($time - $period_start[0] >= $smallest_timescale) {
	    &complete_timescale(0);
	};
	# the record is the first value of the newly opened period
	$sum[0] = $data;
    };
};

# an input without a single data record is an error
die "$0: no input\n" unless defined($first_time);

# Terminate the final entry, treating it as complete even though
# further data that would have fallen into the same period may simply
# never have arrived.
# Only the lowest timescale is "completed" this way; larger
# timescales that are still half-finished are not flushed, and it is
# not clear that flushing them would be any more justified.
# (This extra call does guarantee that exactly 2^n evenly spaced
# data points are all reported---see the dbcolmultiscale_flat test
# case for an example.)
&complete_timescale(0);


# Finish with a trailer comment recording this command and its
# original arguments, then exit successfully.
print "#  | $prog ", join(" ", @orig_argv), "\n";
exit 0;

# Never executed: this only gives the SORTED filehandle a second
# mention in the source (presumably to quiet the "used only once"
# warning under -w).
if (0) {
   <SORTED>;
};