#!/usr/bin/perl # # dbsort # Copyright (C) 1991-2001 by John Heidemann # $Id: dbsort,v 1.34 2004/05/19 18:34:27 johnh Exp $ # # This program is distributed under terms of the GNU general # public license, version 2. See the file COPYING # in $dblibdir for details. # sub usage { print <opterr(0); my($ch); while ($dbopts->getopt) { $ch = $dbopts->opt; if ($ch eq 'd') { $debug++; } elsif ($ch eq 'M') { $max_mem = $dbopts->optarg; } elsif ($ch eq 'T') { $ENV{'TMPDIR'} = $dbopts->optarg; } elsif ($ch eq 'i' || $ch eq 'I') { die "$prog: -i and -I not yet supported in dbsort.\n"; } elsif ($ch eq '?') { &usage; } else { # got a db op. Push it back on and break. $dbopts->ungetopt; last; }; }; &usage if ($#ARGV < 0); my($perl_mem_overhead) = 50; # approx. bytes of overhead for each record in mem $max_mem /= 4; # perl seems to take about 10x more memory than you'd expect &readprocess_header; &write_header(); sub sort_row_col_fn { my($row, $colname, $n) = @_; return '$sf' . $n . '[$' . $row . ']'; # ' } ($compare_code, $enter_code, $num_cols) = &generate_compare_code ('custom_compare', 'sort_row_col_fn', @ARGV); my($mem_count) = 0; die "$prog: no columns were specified as the sort key.\n" if ($num_cols < 0); my($enter_memory_code) = "sub custom_enter {\n" . " my(\$i) = \@_;\n" . $enter_code . # " print \"enter: \\\$sf0[\$i] = \$sf0[\$i]\\n\";\n" . "}\n" . "sub custom_memory_enter {\n" . " my(\$i, \$mem_ref) = \@_;\n" . $enter_code; # " print \"cmenter: \\\$sf0[\$i] = \$sf0[\$i]\\n\";\n" . $enter_memory_code .= " print STDERR \${\$mem_ref},\"\\n\" if (\$mem_count++ % 1000 ==0);\n" if ($mem_debug); # $enter_memory_code .= " print STDERR \${\$mem_ref},\"\\n\";\n" if ($mem_debug); $enter_memory_code .= " \${\$mem_ref} += $perl_mem_overhead + length(\$rawdata[\$#rawdata])\n"; foreach (0..$num_cols) { $enter_memory_code .= "\t\t + length(" . sort_row_col_fn('i', undef, $_) . ')'; }; $enter_memory_code .= ";\n &segment_overflow() if (\${\$mem_ref} > $max_mem);\n}"; eval $enter_memory_code; $@ && die "$0: eval: $@\n"; if ($debug) { print STDERR "COMPARE_CODE:\n$compare_code\nENTER_CODE:\n$enter_memory_code\n"; exit(1) if ($debug > 1); }; # # Handle large things in pieces if necessary. # # call &segment_start to init things, # &segment_overflow to close one segment and start the next # &segment_merge to put them back together again. # # Note that we don't invoke the merge code unless the data # exceeds some threshold size, so small sorts happen completely # in memory. # # Once we give up on memory, all the merging happens by making # passes over the disk files. # my(@sortedp, @rawdata, @p, @files_to_merge, $i, $memory_used); sub segment_start { $i = -1; $memory_used = 0; # undef(@sortedp, @rawdata, @p); # free the mem (maybe next line does that too) undef @sortedp; undef @rawdata; undef @p; @sortedp = @rawdata = @p = (); } sub segment_overflow { my($done) = @_; # sort the segment @sortedp = sort custom_compare @p; # pass on the data, either to a tmp file stdout if ($#files_to_merge >= 0 || $memory_used > $max_mem) { push(@files_to_merge, db_tmpfile(OUT)); } else { open(OUT, ">-") || die "$0: cannot reopen STDOUT.\n"; }; foreach (@sortedp) { print OUT $rawdata[$_]; }; close OUT; # clean up memory usage # and try again print "memory used: $memory_used\n" if ($debug); &segment_start; } # # &segment_merge merges the on-disk files we've built up # in the work queue @files_to_merge. # # Changed Nov. 2001: try to process the work queue in # a file-system-cache-friendly order (based on ideas from # "Information and Control in Gray-box Systems" by # the Arpaci-Dusseau's at SOSP 2001. # # Idea: each "pass" through the queue, revsere the processing # order so that the most recent data (that's hopefully # in memory) is handled first. # # This algorithm isn't perfect (sometimes if there's an odd number # of files in the queue you reach way back in time, but most of # the time it's quite good). # # Also, in an ideal world $max_mem actually would be some sizable # percentage of memory, and so this whole optimization would # be useless because there would be no spare memory for the file system # cache. But for saftey reasons (because we don't know how much # RAM there is, and there is multiprocessing, etc.), $max_mem # is almost always hugely conservative. As of Nov. 2001 it defaults to # 10MB, but most workstations have >= 512MB memory. # sub segment_merge { return if ($#files_to_merge < 0); # keep track of how often to reverse my($files_before_reversing_queue) = 0; # Merge the files in a binary tree. while ($#files_to_merge >= 0) { # Each "pass", reverse the queue to reuse stuff in memory. if ($files_before_reversing_queue <= 0) { @files_to_merge = reverse @files_to_merge; $files_before_reversing_queue = $#files_to_merge + 1; print "reversed queue, $files_before_reversing_queue before next reverse.\n" if ($debug); }; $files_before_reversing_queue -= 2; # Pick off the two next files for merging. my(@fns); die "$0: segment_merge, odd number of segments.\n" if ($#files_to_merge == 0); push(@fns, shift @files_to_merge); push(@fns, shift @files_to_merge); # send the output to another file, or stdout if we're done if ($#files_to_merge >= 0) { push(@files_to_merge, db_tmpfile(OUT)); } else { open(OUT, ">-") || die "$0: cannot reopen STDOUT.\n"; }; print "merging $fns[0] and $fns[1] to " . ($#files_to_merge >=0 ? $files_to_merge[$#files_to_merge] : "STDOUT") . "\n" if ($debug); merge_to_out(@fns); close OUT; # verify($files_to_merge[$#files_to_merge]) if ($#files_to_merge >= 0); foreach (@fns) { db_tmpfile_cleanup($_); }; }; } # This function is very custom for debugging. # sub verify { # my($fn) = @_; # open(F, "<$fn") || die; # my($last); # my($i) = 0; # while () { # $i++; # $last = $_ if (!defined($last)); # if ($last > $_) { # die "bogus on line $i\n"; # }; # }; # close F; # } sub merge_to_out { my(@fh) = qw(A B); my($j); foreach $j (0..1) { $fh[$j] = new FileHandle; $fh[$j]->open("<$_[$j]") || die "$0: cannot open $_[$j].\n"; &merge_read_one($fh[$j], $j) || die "$0: $_[$j] is empty.\n"; }; my($winner); $a = 0; $b = 1; # for custom_compare for (;;) { $winner = &custom_compare > 0 ? 1 : 0; # print "\$sf0[0] = $sf0[0], \$sf0[1] = $sf0[1], \$winner = $winner, $rawdata[$winner]"; print OUT $rawdata[$winner]; # refill buffer if (!&merge_read_one($fh[$winner], $winner)) { # $winner is exhausted. Drain !$winner's buffer, then break and finish below print OUT $rawdata[!$winner]; last; }; }; # finish up !$winner # while (<$fh[!$winner]>) returns "A"--a perl bug in 5.004_04 # work around: use eof/getline methods. while (!$fh[!$winner]->eof) { # print "clearing $fh[!$winner]\n"; print OUT $fh[!$winner]->getline; }; foreach (0..1) { close $fh[$_]; }; } sub merge_read_one { my($fh, $index) = @_; $_ = scalar <$fh>; return undef if (!defined($_)); # out of data $rawdata[$index] = $_; &split_cols; # print "read from $fh into $i, $_"; &custom_enter($index);; return 1; } # # read in and set up the data # &segment_start; @files_to_merge = (); while () { # NEEDSWORK: should buffer comments to a file, not memory. next if (&delayed_pass_comments); push (@rawdata, $_); $i++; push (@p, $i); &split_cols; &custom_memory_enter($i, \$memory_used); # $@ && die("$prog: internal eval error: $@.\n"); }; # handle end case &segment_overflow if ($i >= 0); &segment_merge; &delayed_flush_comments; print "# | $prog ", join(" ", @orig_argv), "\n"; exit 0;