#!/usr/bin/perl -wT
use strict;
# Copyright (c) 2003 Jon Atkins http://www.jonatkins.com/
#
# Permission is hereby granted, free of charge, to any person obtaining a
# copy of this software and associated documentation files (the "Software"),
# to deal in the Software without restriction, including without limitation
# the rights to use, copy, modify, merge, publish, distribute, sublicense,
# and/or sell copies of the Software, and to permit persons to whom the
# Software is furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in
# all copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
# DEALINGS IN THE SOFTWARE.
my $cacheversion = "0.6.4";
###### options ######
# this is added to the start of all files loaded/saved
# NOTE: can be a full path or a relative path
# NOTE: the cache MUST have write-access to the specified directory
# eg. my $prefix = "/tmp/gcache.example.com" would store files in "/tmp/gcache.example.com-*"
# eg. my $prefix = "data/data" would store files in a data directory below the location of the CGI script
my $prefix = "../cgi-data/gwebcache/data"; # will use "data-*" in the same directory as this cgi script
# a large number of URLs are stored, and clients get a random selection
# returned on each request.
# URLs are removed from the list once they are $maxurlage minutes old
# if URL testing is enabled (see $checkurls below) then caches
# are tested with a ping request. failed caches are also stored to prevent
# repeated retests on additional updates
# return this many alternate cache urls
my $maxurlret = 10; # suggested range: between 10 and 20
# store this many URLs
my $maxurlstore = 200; # suggested range: 200 to 500
# remove URLs after this many minutes
my $maxurlage = 1*24*60; # in minutes - default 1 day
# number of hosts to keep and return
my $maxhost = 20;
# maximum age of a host entry, in minutes
my $maxhostage = 6*60; # in minutes - default 12 hours by default
# set to non-zero to block clients which don't send a client and version parameter
# (this setting only applys to v1 requests - v2 requests require a client string)
# NOTE: in the original cache specs sending client/version was optional
my $blockv1anon = 0;
# log cache stats
my $enablestats = 1;
# minimum time, in minutes, between updates
my $updatelimit = 55; # suggested range: 30 - 55 minutes
# check each URL submitted works
my $checkurls = 0;
# HTTP proxy server to use when $checkurls is 1 - undef for no proxy
my $proxyhost = undef;
my $proxyport = undef;
# request rate limiting
# set either to zero to disable this feature
my $requestlimitcount = 0; # suggested value: 0, or 5 - 10
my $requestlimittime = 55; # suggested maximum 120 (2 hours)
# should we answer old-style v1 requests?
my $enablev1requests = 1;
# EXPERIMENTAL: see the notes at http://www.jonatkins.com/perlgcache/
# before enabling this feature.
my $enablev2requests = 0;
# note: if both v1/v2 are set to 0, act as if v1 is enabled
###### end of configuration ######
# NO USER OPTIONS AFTER HERE
use IO::Socket; # for testing of sent URLs
###### global vars #####
# globals: read time once - to avoid possible race conditions
# note: values set at the top of dorequest() - ready for FastCGI version
my $time;
my $day;
###### subroutines ######
# first, generic routines...
sub error
{
my ( @args ) = @_;
print "ERROR: ".(join ' ', @args)."\n";
exit;
}
sub paramdecode
{
my ( $query ) = @_;
my %param;
return () unless defined $query;
foreach my $arg ( split /&/, $query )
{
my ( $name, $val ) = split /=/, $arg;
next unless defined $name and defined $val;
$val =~ s/\+/ /g;
$val =~ s/%([0-9a-fA-F]{2})/chr hex $1/eg;
$param{$name} = $val;
}
return %param;
}
sub loadarrayhash
{
my ( $filename ) = @_;
my @array;
open IN, "$filename" or return ();
my $header = ;
return () unless defined $header;
chomp $header;
my @fields = split /\|/, $header;
foreach my $line ( )
{
chomp $line;
my @items = split /\|/, $line, scalar @fields;
my %item;
for ( my $i=0; $i < @items; $i++ )
{
$item{$fields[$i]} = defined $items[$i] ? $items[$i] : '';
}
push @array, \%item;
}
close IN;
#print Dumper ( \@array );
return @array;
}
sub savearrayhash
{
my ( $filename, @array ) = @_;
#TODO: proper error checking in here, so we don't loose data files when the
#disk is full
# (build the file in a string then write in one go, to make this easier..?
# may be better on file I/O performance this way)
my $tmpfile = "$filename.tmp.$$";
open OUT, "> $tmpfile" or die;
return if @array == 0;
my @fields = sort keys %{$array[0]};
print OUT join "|", @fields;
print OUT "\n";
foreach my $item ( @array )
{
my $first = 1;
foreach my $field ( @fields )
{
print OUT "|" unless $first;
$first = 0;
my $val = ${$item}{$field};
$val = '' unless defined $val;
print OUT "$val";
}
print OUT "\n";
}
close OUT;
rename $tmpfile, $filename or error "rename of $tmpfile to $filename failed";
}
sub loadhashhash
{
my ( $filename ) = @_;
# right - let's get loadarrayhash to do the work here...
my @array = loadarrayhash $filename;
my %hash;
foreach my $item ( @array )
{
my $key = ${$item}{KEY};
$hash{$key} = $item;
delete $hash{$key}{KEY};
}
#print Dumper \%hash;
return %hash;
}
sub savehashhash
{
my ( $filename, %hash ) = @_;
#print Dumper \%hash;
my @array;
foreach my $item ( keys %hash )
{
my %newitem;
%newitem = %{$hash{$item}};
$newitem{KEY} = $item;
push @array, \%newitem;
}
savearrayhash $filename, @array;
}
sub fixarrayfields
{
my ( $fieldsref, $arrayref ) = @_;
foreach my $item ( @{$arrayref} )
{
# add any missing fields...
foreach my $field ( keys %{$fieldsref} )
{
${$item}{$field} = ${$fieldsref}{$field} unless defined ${$item}{$field};
}
# .. and remove any not required...
foreach my $field ( keys %{$item} )
{
delete ${$item}{$field} unless exists ${$fieldsref}{$field};
}
}
}
sub getiptype
{
my ( $ip ) = @_;
#TODO: add support for IPv6 here...
return "invalid" unless $ip =~ m/^(\d{1,3})\.(\d{1,3})\.(\d{1,3})\.(\d{1,3})$/;
my ( $a, $b, $c, $d ) = ( $1, $2, $3, $4 );
return "invalid" if ( $a<0 or $a>255 or $b<0 or $b>255 or $c<0 or $c>255 or $d<0 or $d>255 );
# don't allow 0.*.*.* (this net), 127.*.*.* (loopback) or 224-255.*.*.* (multicast, class-e, broadcast)
return "unusable" if ( $a==0 or $a==127 or $a>=224 );
# mark 10.*.*.* (private a), 169.254.*.* (local link), 172.16-31.*.* (private b), 192.0.2.* (test net), 192.168.*.* (private c)
return "private" if ( ($a==10) or ($a==169 and $b==254) or ($a==172 and ($b>=16 and $b<=31)) or ($a==192 and $b==0 and $c==2) or ($a==192 and $b==168) );
return "public";
}
# HTTP request routines - for testing of URLs....
# used for the internals of http requests...
sub socket_send_and_get
{
my ( $host, $port, @data ) = @_;
my ( $remote, $recv );
$remote = IO::Socket::INET->new ( Proto => 'tcp', PeerAddr => $host, PeerPort => $port, Timeout => 60 );
return undef unless $remote;
$remote->autoflush(1);
print $remote @data;
local $/ = undef;
$recv = <$remote>;
close $remote;
alarm 0;
return $recv;
}
# sends a http request to the given url and returns all data (header and body) as a single string.
sub http_request
{
my ( $url ) = @_;
if ( $url =~ m{^http://([a-zA-Z0-9-.]+)(?::(\d{1,5}))?(.*)$} )
{
my $host = $1;
my $port = $2;
my $path = $3;
$port = 80 unless defined $port;
$path = '/' if $path eq '';
if ( defined $proxyhost and defined $proxyport )
{
my $reply = socket_send_and_get $proxyhost, $proxyport, "GET $url HTTP/1.0\r\n", "Host: $host:$port\r\n", "User-agent: perlgcache/$cacheversion (via proxy)\r\n", "X-RequestingIP: $ENV{REMOTE_ADDR}\r\n", "\r\n";
return $reply;
}
my $reply = socket_send_and_get $host, $port, "GET $path HTTP/1.0\r\n", "Host: $host:$port\r\n", "User-agent: perlgcache/$cacheversion\r\n", "X-RequestingIP: $ENV{REMOTE_ADDR}\r\n", "\r\n";
return $reply;
}
return undef;
}
# does a http request for a text/* object, processes redirect as required
# splits the output into an array of lines (without line terminators) and returns an array of:
# ( $code, $redircount, @lines )
sub do_http_text_request
{
my ( $url, $redircount ) = @_;
$redircount = 0 unless defined $redircount;
# to stop redirect loops...
return ( -4, $redircount ) if $redircount >= 10;
my $reply = http_request $url;
return ( -1, $redircount ) unless defined $reply;
# HTTP headers should use \r\n to seperate lines - but for robustness clients can accept any format
# i'll add that here later maybe
my ( $header, $body ) = split /\r\n\r\n/, $reply, 2;
return ( -5, $redircount ) unless defined $header and defined $body;
my @header = split /\r\n/, $header;
my $statusline = shift @header;
return ( -2, $redircount ) unless defined $statusline;
return ( -3, $redircount ) unless ( $statusline =~ m{^HTTP/\d+\.\d+ +(\d\d\d) *(.*)$} );
my ( $code, $message ) = ( $1, $2 );
if ( $code =~ m/^3\d\d$/ )
{
# we've got a redirect - find the Location: header and follow it...
foreach my $headerline ( @header )
{
if ( $headerline =~ m{^location: +(.*)$}i )
{
# right - location headers *should* be absolute, but sometimes they're not...
my $newurl = $1;
if ( $newurl =~ m{^http:} )
{
# full path - no problem
return do_http_text_request ( $newurl, $redircount+1 );
}
elsif ( $newurl =~ m{^/} )
{
# server relative url - add current server and port
$url =~ m{^(http://.*?)/} or die;
$newurl = $1.$newurl;
return do_http_text_request ( $newurl, $redircount+1 );
}
else
{
# directory relative url - add current server and path
$url =~ m{^(http:.*/)} or die;
$newurl = $1.$newurl;
return do_http_text_request ( $newurl, $redircount+1 );
}
}
}
# if we get here, there was no Location header...
}
if ( $code ne '200' )
{
# bad code - fail
return ( $code, $redircount );
}
# right - we've got the data - now let's check the type...
foreach my $headerline ( @header )
{
if ( $headerline =~ m/^content-type: +(.*$)/i )
{
my $type = $1;
# error on non text/* content-types...
error ( -6, $redircount ) unless ( $type =~ m{^text/.*$} );
}
# (if we don't find a content-type, just assume it's text)
}
# it's good - now we need to sort out the body lines....
# fun, because they could be seperated with just '\n', '\r\n' or even '\r' (and '\n\r'?)
my @lines;
# (isn't perl great!!)
@lines = split /\r\n|\n\r|\n|\r/, $body;
return ( $code, $redircount, @lines );
}
# now, utility routines for a gwebcache...
my %hostsfields =
(
ipport => '0.0.0.0:0', # well we have to have something in there by default
client => 'unknown',
time => 0,
);
sub gethosts
{
my ( $network ) = @_;
my $file = "$prefix-hosts".($network eq 'gnutella' ? "" : "-$network");
my @hosts = loadarrayhash $file;
fixarrayfields \%hostsfields, \@hosts;
# TODO: remove old/bad entries
return @hosts;
}
sub addhost
{
my ( $network, $ipport, $client ) = @_;
my $file = "$prefix-hosts".($network eq 'gnutella' ? "" : "-$network");
my @hosts = loadarrayhash $file;
fixarrayfields \%hostsfields, \@hosts;
my $alreadyhave = 0;
foreach my $loopipport ( @hosts )
{
if ( ${$loopipport}{ipport} eq $ipport )
{
$alreadyhave = 1;
}
}
# just return if we already have it
# NOTE: we should probably update it's timestamp, but it isn't really worth it
return if $alreadyhave;
# add entry to start of list
unshift @hosts, { ipport => $ipport, client => $client, time => $time };
# remove extra hosts
splice @hosts, $maxhost if @hosts > $maxhost;
# and max age
while ( scalar @hosts and $hosts[scalar @hosts - 1]{time} < $time - $maxhostage*60 )
{
pop @hosts;
}
savearrayhash $file, @hosts;
}
#sub pickhosts
#{
# my ( $client, @hosts ) = @_;
#
# $client = 'no such client' if $client eq 'unknown';
#
# my %done;
#
# my @newhosts;
#
# # first, pick hosts of the same client type...
# foreach my $item ( @hosts )
# {
# next if ${$item}{client} ne $client;
# next if ${$item}{time} < $time - $maxhostage*60;
#
# push @newhosts, $item;
# $done{${$item}{ipport}} = 1;
# last if @newhosts >= $maxhostret * $hostsameclientratio;
# }
#
# # now pick some more hosts - client not important (could be more of the same client)
# foreach my $item ( @hosts )
# {
# next if defined $done{${$item}{ipport}};
# next if ${$item}{time} < $time - $maxhostage*60;
#
# push @newhosts, $item;
# last if @newhosts >= $maxhostret;
# }
#
# return @newhosts;
#}
my %urlfields =
(
url => 'http://localhost/ignore',
time => 0,
version => 'unknown',
);
sub geturls
{
my ( $cachetype, $network ) = @_;
my $file = "$prefix-urls".($cachetype == 1 ? "" : "-$cachetype");
$file .= "-$network" if defined $network;
my @urls = loadarrayhash $file;
fixarrayfields \%urlfields, \@urls;
# first, remove any URLs which are too old...
# NOTE: URLs are stored newest to oldest, so we can just repeatedly drop the last one
while ( @urls > 1 and $urls[@urls-1]{time} < $time - $maxurlage*60 )
{
splice @urls, -1; # remove last entry
}
return @urls;
}
sub addurl
{
my ( $cachetype, $url, $network ) = @_;
my $file = "$prefix-urls".($cachetype == 1 ? "" : "-$cachetype");
$file .= "-$network" if defined $network;
my @urls = loadarrayhash $file;
fixarrayfields \%urlfields, \@urls;
# first, remove any URLs which are too old...
# NOTE: URLs are stored newest to oldest, so we can just repeatedly drop the last one
while ( @urls > 1 and $urls[@urls-1]{time} < $time - $maxurlage*60 )
{
splice @urls, -1; # remove last entry
}
my $alreadyhave = 0;
foreach my $loopurl ( @urls )
{
if ( ${$loopurl}{url} eq $url )
{
$alreadyhave = 1;
}
}
# just return if we already have it
# NOTE: we should really update it's timestamp, but it isn't really worth it
return if $alreadyhave;
my $urlcachever = 'unknown';
if ( $checkurls )
{
my $requrl = $url;
if ( $cachetype == 1 )
{
$requrl .= "?client=TEST&version=pgc-$cacheversion&ping=1";
}
else
{
# sending a ping isn't defined in the current GWC2 specs - so send a
# get as well to ensure we trigger the correct mode of a dual-version cache
$requrl .= "?client=TESTpgc-$cacheversion&ping=1&get=1";
$requrl .= "&net=$network" if defined $network;
}
my ( $code, undef, @lines ) = do_http_text_request $requrl;
if ( $code ne '200' )
{
$urlcachever = 'FAILED';
}
else
{
if ( $cachetype == 1 )
{
# v1 test - at least one line, and the first line starts with PONG
if ( @lines >= 1 and $lines[0] =~ m{^PONG\s*([a-zA-Z0-9. _/+-]{0,100})} )
{
$urlcachever = $1 if $1 ne ''; # ignore if blank
}
else
{
$urlcachever = 'FAILED';
}
}
else
{
# v2 test - each line must be of the form '[a-zA-Z]|.*'
$urlcachever = 'FAILED' if @lines == 0;
foreach my $line ( @lines )
{
chomp $line;
if ( $line =~ m{^[iI]\|[pP][oO][nN][gG]\|([a-zA-Z0-9. _/+-]{0,100})} )
{
$urlcachever = $1 if $1 ne '';
}
elsif ( $line !~ m/^[a-zA-Z0-9]\|/ )
{
$urlcachever = 'FAILED';
last;
}
}
}
}
}
# add entry
unshift @urls, { url => $url, time => $time, version => $urlcachever };
# remove extra urls
splice @urls, $maxurlstore if @urls > $maxurlstore;
savearrayhash $file, @urls;
}
sub pickurls
{
my ( @urls ) = @_;
my @newurls;
while ( @urls >= 1 and @newurls < $maxurlret )
{
my $rand = int ( rand @urls );
my $item = splice @urls, $rand, 1; # pick an item from the list and remove it
next if ${$item}{version} eq 'FAILED'; # ignore failed URLs
push @newurls, $item;
}
return @newurls;
}
sub getnetworks
{
my %networks = ();
# get the networks from the hosts files...
foreach my $file ( <$prefix-hosts*> )
{
# nasty hack to allow for no -gnutella suffix special case
$file = "$prefix-hosts-gnutella" if $file eq "$prefix-hosts";
$networks{substr $file, length "$prefix-hosts-"} = 1;
}
# .. and the url files - for the gwc2 network only
foreach my $file ( <$prefix-urls-2-*> )
{
# nasty hack not needed - we always append the network name for gwc2 url databases
# $file = "$prefix-urls-gnutella" if $file eq "$prefix-urls";
$networks{substr $file, length "$prefix-urls-2-"} = 1;
}
return sort keys %networks;
}
sub statlog
{
return unless $enablestats;
my ( $day, $action ) = @_;
# NOTE: loadhashhash REQUIRES that all of the hashes have the same members, or data WILL NOT be saved correctly,
# so use this to check...
my %stattypes =
(
update => 1,
hostfile => 1,
urlfile => 1,
statfile => 1,
ping => 1,
badrequest => 1,
block_requestrate => 1,
block_anon => 1,
update_block_badport => 1,
update_block_badip => 1,
update_block_ratelimit => 1,
block_badpath => 1,
# v2 actions are any combination of get, ping and update - always sorted
v2_get => 1,
v2_ping => 1,
v2_update => 1,
v2_get_ping => 1,
v2_get_update => 1,
v2_ping_update => 1,
v2_get_ping_update => 1,
v2_block_anon => 1,
v2_update_block_badport => 1,
v2_update_block_badip => 1,
v2_update_block_badurl => 1,
v2_update_block_ratelimit => 1,
);
# make sure this stat type exists...
return unless defined $stattypes{$action};
my %stats = loadhashhash "$prefix-stats";
# add in this stat item...
$stats{$day}{$action} = 0 unless defined $stats{$day}{$action};
$stats{$day}{$action}++;
# and also into the totals...
$stats{'total'}{$action} = 0 unless defined $stats{$day}{$action};
$stats{'total'}{$action}++;
foreach my $statday ( keys %stats )
{
# now remove old stat data...
if ( $statday ne 'total' and $statday < $day - 28 )
{
delete $stats{$statday};
next;
}
# remove bogus stats...
foreach my $stattype ( keys %{$stats{$statday}} )
{
delete $stats{$statday}{$stattype} unless defined $stattypes{$stattype};
}
# and add unset stats...
foreach my $stattype ( keys %stattypes )
{
$stats{$statday}{$stattype} = 0 unless defined $stats{$statday}{$stattype};
}
}
savehashhash "$prefix-stats", %stats;
}
# returns an array of ( $client, $version ) from a decoded cgi parameter hash
# allows for both v1 client=XXX&version=a.b.c and v2 client=XXXXa.b.c style versions...
sub getclientandversion
{
my ( %p ) = @_;
# the V2 specs say client and version should be combined into a single parameter - work around that here...
if ( defined $p{client} and !defined $p{version} and $p{client} =~ m/^([A-Za-z0-9]{4})(.+)$/ )
{
$p{client} = $1;
$p{version} = $2;
}
my $client;
if ( !defined $p{client} )
{
$client = "unknown";
}
elsif ( $p{client} =~ m/^[a-zA-Z0-9.+-]{1,20}$/ )
{
$client = $p{client};
}
else
{
$client = "unknown_bad";
}
my $version;
if ( !defined $p{version} )
{
$version = "unknown";
}
elsif ( $p{version} =~ m{^[a-zA-Z0-9/.+ @\-]{1,40}$} )
{
$version = $p{version};
}
else
{
$version = "unknown_bad";
}
return ( $client, $version );
}
###### main code ######
# some subs, to break up the code a little...
sub dohtmlpage
{
my ( %param ) = @_;
$| = 1;
my $browse = "index";
$browse = $param{browse} if defined $param{browse} and $browse =~ m/^[a-zA-Z0-9]{1,100}$/;
print "Content-type: text/html\n";
print "\n";
print <perlgcache $cacheversion
perlgcache $cacheversion
[
home |
hosts |
urls |
limited IPs |
stats
]
END
# right - it's a good idea to test the data directory is writable here - let's create then delete a file
my $ok = 0;
my $tmpfile = "$prefix-writetest.$$.tmp";
if ( open OUT, "> $tmpfile" )
{
close OUT;
unlink $tmpfile;
$ok = 1;
}
if ( ! $ok )
{
print "
\n";
print "Datafile write failure
\n";
print "This cache is not correctly configured!
\n";
print "You need to configure \$prefix to point to a writable directory.
\n";
print "
\n";
}
if ( $browse eq 'stats' )
{
if ( $enablestats )
{
my %stats = loadhashhash "$prefix-stats";
print "Cache stats\n";
print " | \n";
foreach my $stattype ( sort keys %{$stats{'total'}} )
{
next if $stats{'total'}{$stattype} == 0;
my $statdisp = $stattype;
$statdisp =~ s/_/ /g;
print "$statdisp | ";
}
print " | Total | ";
print "
\n";
foreach my $statday ( sort keys %stats )
{
print " |
\n" if $statday eq 'total';
my $daytext = "Total";
$daytext = scalar gmtime ($statday*60*60*24) unless $statday eq 'total';
$daytext =~ s/00:00:00 //;
$daytext =~ s/ / /g;
print "| $daytext | ";
my $total = 0;
foreach my $stattype ( sort keys %{$stats{'total'}} )
{
next if $stats{'total'}{$stattype} == 0;
print "$stats{$statday}{$stattype} | ";
$total += $stats{$statday}{$stattype};
}
print " | $total | ";
print "
\n";
}
print "
\n";
}
}
elsif ( $browse eq 'hosts' )
{
foreach my $network ( getnetworks )
{
my @hosts = gethosts $network;
print "Host cache - net=$network\n";
print "| IP:Port | Client | Time |
\n";
foreach my $item ( @hosts )
{
next if ${$item}{time} < $time - $maxhostage*60;
print "| ${$item}{ipport} | ${$item}{client} | ".(scalar gmtime ${$item}{time})." |
\n";
}
print "| no hosts |
\n" if scalar @hosts == 0;
print "Maximum of $maxhost hosts stored/returned. \n";
print "Hosts are no older than $maxhostage minutes |
\n";
print "
\n";
}
}
elsif ( $browse eq 'urls' )
{
my @urls = geturls 1;
if ( $enablev1requests )
{
print "URL cache - GWebcache\n";
print "| URL | Version | Time |
\n";
foreach my $item ( @urls )
{
print "| ${$item}{url} | ${$item}{version} | ".(scalar gmtime ${$item}{time})." |
\n";
}
print "| no urls |
\n" if scalar @urls == 0;
print "
\n";
}
if ( $enablev2requests )
{
foreach my $network ( getnetworks )
{
my @urls = geturls 2, $network; ##loadarrayhash "$prefix-urls";
print "URL cache - GWebcache2 - net=$network\n";
print "| URL | Version | Time |
\n";
foreach my $item ( @urls )
{
print "| ${$item}{url} | ${$item}{version} | ".(scalar gmtime ${$item}{time})." |
\n";
}
print "| no urls |
\n" if scalar @urls == 0;
print "
\n";
}
}
}
elsif ( $browse eq 'limited' )
{
my %updateip = loadhashhash "$prefix-updateip";
print "Recent updates\n";
print "| Client IP | Update at |
\n";
foreach my $ip ( sort { $updateip{$b}{time} <=> $updateip{$a}{time} } keys %updateip )
{
next if $updateip{$ip}{time} < $time - $updatelimit*60;
print "| $ip | ".(scalar gmtime $updateip{$ip}{time})." |
\n";
}
print "| No updates for another $updatelimit minutes |
\n";
print "| no blocked IPs |
\n" if scalar keys %updateip == 0;
print "
\n";
if ( $requestlimitcount and $requestlimittime )
{
my %requestip = loadhashhash "$prefix-requestip";
print "Recent requests\n";
print "| Client IP | count | time |
\n";
my %lowcount = ();
foreach my $ip ( sort { $requestip{$b}{count} <=> $requestip{$a}{count} } keys %requestip )
{
next if $requestip{$ip}{time} < $time - $requestlimittime*60;
if ( $requestip{$ip}{count} <= $requestlimitcount )
{
$lowcount{$requestip{$ip}{count}}++;
}
else
{
print "| $ip | $requestip{$ip}{count} | ".(scalar gmtime $requestip{$ip}{time})." |
\n";
}
}
foreach my $count ( sort { $b <=> $a } keys %lowcount )
{
print "| $lowcount{$count} IP(s) have made $count request(s) in the last $requestlimittime minutes |
\n";
}
print "| No more than $requestlimitcount requests every $requestlimittime minutes |
\n";
print "
\n";
}
}
else
{
print <Welcome to this gnutella
web cache.
perlgcache
by Jon Atkins
END
}
print "Current time: ".(scalar gmtime $time)."
\n";
print <
END
}
sub dov1request
{
my ( %param ) = @_;
my ( $client, $version ) = getclientandversion %param;
my $network = 'gnutella'; ## NOTE: default net name - supplying no net= param is the same as sending this value
## $network = lc $1 if defined $param{net} and $param{net} =~ m/^([a-zA-Z0-9._-]{1,20})$/;
## # (NOTE: above restrictions are the same as I use for the client string - no particular reason for this)
if ( $blockv1anon )
{
# unless ( defined $param{client} and defined $param{version} )
if ( $client =~ m/^unknown/ ) # just check for client
{
statlog $day, "block_anon";
error "anonymous clients not allowed here";
}
}
# bodge: works around differences with an older version of the specs
$param{url} = $param{url1} if !defined $param{url} and defined $param{url1};
$param{ip} = $param{ip1} if !defined $param{ip} and defined $param{ip1};
if ( defined $param{url} or defined $param{ip} )
{
# limit update rate by remembering the cache...
my %updateip = loadhashhash "$prefix-updateip";
foreach my $ip ( keys %updateip )
{
if ( $updateip{$ip}{time} < $time - $updatelimit*60 )
{
delete $updateip{$ip};
next;
}
if ( $ENV{REMOTE_ADDR} eq $ip )
{
statlog $day, "update_block_ratelimit";
print "OK\n";
print "WARNING: update denied: $ENV{REMOTE_ADDR} has sent updates in the last $updatelimit minutes\n";
exit;
}
}
$updateip{$ENV{REMOTE_ADDR}}{time} = $time;
savehashhash "$prefix-updateip", %updateip;
print "OK\n";
if ( defined $param{ip} )
{
my $ipport = $param{ip};
if ( $ipport =~ m{^(\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}):(\d{1,5})$} )
{
my ( $ip, $port ) = ( $1, $2 );
if ( $port < 1 or $port > 65535 )
{
statlog $day, "update_block_badport";
error "port $port out of range";
}
my $iptype = getiptype $ip;
# TODO: allow 'private' addresses in certain situations??
unless ( $iptype eq 'public' )
{
statlog $day, "update_block_badip";
error "ip $ip invalid ($iptype)";
}
unless( $client eq "MUTE" ) {
error "client type $client invalid";
}
addhost "gnutella", $ipport, $client;
}
else
{
print "WARNING: invalid format ip parameter\n";
}
}
if ( defined $param{url} )
{
my $url = $param{url};
# bodge to change %7e to ~
$url =~ s{%7[eE]}{~}g;
if ( $url =~ m{^http://[a-zA-Z0-9.-]{1,200}(:\d{1,5})?/[a-zA-Z0-9/.~_-]{0,250}$} )
{
addurl 1, $url; # add url to the gwebcache v1 list...
}
else
{
print "WARNING: invalid format/character in url\n";
}
}
statlog $day, "update";
}
elsif ( defined $param{hostfile} )
{
my @hosts = gethosts $network; ##loadarrayhash "$prefix-hosts";
# @hosts = pickhosts $client, @hosts;
my $testclient = $client eq 'unknown' ? 'not set' : $client;
my $outcount = 0;
foreach my $item ( @hosts )
{
print "${$item}{ipport}\n";
}
# hack: the v1 cache specs allows caches to return no lines, but some clients
# assume this is an error. let's always send something, even if it's useless
# NOTE: this only happens when the cache has no hosts
if ( @hosts == 0 )
{
print "0.0.0.0:0\n"; # an invalid IP/port but should match the format clients are expecting
}
statlog $day, "hostfile";
}
elsif ( defined $param{urlfile} )
{
my @urls = geturls 1; # load gwebcache v1 list
@urls = pickurls @urls;
foreach my $item ( @urls )
{
print "${$item}{url}\n";
}
# hack: the v1 cache specs allows caches to return no lines, but some clients
# assume this is an error. let's always send something, even if it's useless
# NOTE: this only happens when the cache has no valid URLs
if ( @urls == 0 )
{
print "http://0.0.0.0:0/\n"; # an invalid IP/port but should match the format clients are expecting
}
statlog $day, "urlfile";
}
elsif ( defined $param{ping} )
{
print "PONG perlgcache/$cacheversion\n";
statlog $day, "ping";
}
elsif ( defined $param{statfile} )
{
if ( $enablestats )
{
my %stats = loadhashhash "$prefix-stats";
my ( $totalreq, $dayreq, $dayupd ) = ( 0, 0, 0 );
my $statday = $day - 1;
# type1: count all requests for the requests figure...
foreach my $statitem ( keys %{$stats{'total'}} )
## # type2: count just hostfile and urlfile requests
## foreach my $statitem ( 'hostfile', 'urlfile' )
{
$dayreq += $stats{$statday}{$statitem} if defined $stats{$statday};
$totalreq += $stats{'total'}{$statitem};
}
$dayupd += $stats{$statday}{'update'} if defined $stats{$statday};
$dayupd += $stats{$statday}{'v2_update'} if defined $stats{$statday};
print $totalreq."\n";
# NOTE: the statfile=1 specs ask for hour stats, but we only store day stats... fake them...
print int($dayreq/24)."\n";
print int($dayupd/24)."\n";
}
else
{
print "NOTE: statfile not supported\n";
}
statlog $day, "statfile";
}
else
{
statlog $day, "badcommand";
error "unknown command";
}
}
sub dov2request
{
my ( %param ) = @_;
my ( $client, $version ) = getclientandversion %param;
my $network = 'gnutella'; ## NOTE: default net name - supplying no net= param is the same as sending this value
$network = lc $1 if defined $param{net} and $param{net} =~ m/^([a-zA-Z0-9._-]{1,20})$/;
# (NOTE: above restrictions are the same as I use for the client string - no particular reason for this)
if ( 1 ) # always enforce client parameter for v2 requests
{
if ( $client =~ m/^unknown/ ) # just check for client
{
statlog $day, "v2_block_anon";
error "V2 Anonymous clients not allowed here";
}
}
# OK - the v2 specs say that a cache must always return something - returning nothing is an error
# they also specify a response to a ping but not the query to make!
# there's also the fact that unwanted info can always be returned
# so... - let's always output our ping line...
print "i|pong|perlgcache/$cacheversion\n";
my @actions;
if ( defined $param{ping} )
{
push @actions, "ping";
}
if ( defined $param{update} )
{
# limit update rate by remembering the cache...
my %updateip = loadhashhash "$prefix-updateip";
foreach my $ip ( keys %updateip )
{
if ( $updateip{$ip}{time} < $time - $updatelimit*60 )
{
delete $updateip{$ip};
next;
}
if ( $ENV{REMOTE_ADDR} eq $ip )
{
statlog $day, "update_block_ratelimit";
print "i|update|WARNING|You came back too soon\n";
exit;
}
}
$updateip{$ENV{REMOTE_ADDR}}{time} = $time;
savehashhash "$prefix-updateip", %updateip;
if ( defined $param{ip} )
{
my $ipport = $param{ip};
if ( $ipport =~ m{^(\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}):(\d{1,5})$} )
{
my ( $ip, $port ) = ( $1, $2 );
if ( $port < 1 or $port > 65535 )
{
statlog $day, "v2_update_block_badport";
print "i|update|WARNING|Rejected IP\n";
print "d|update|ip|port $port out of range\n";
return;
}
my $iptype = getiptype $ip;
# TODO: allow 'private' addresses in certain situations??
unless ( $iptype eq 'public' )
{
statlog $day, "v2_update_block_badip";
print "i|update|WARNING|Rejected IP\n";
print "d|update|ip|ip $ip isn't public - $iptype\n";
}
addhost $network, $ipport, $client;
print "i|update|OK\n";
print "d|update|ip|ip=$ip:$port net=$network accepted\n";
}
}
if ( defined $param{url} )
{
my $url = $param{url};
# bodge to change %7e to ~
$url =~ s{%7[eE]}{~}g;
if ( $url =~ m{^http://[a-zA-Z0-9.-]{1,200}(:\d{1,5})?/[a-zA-Z0-9/.~_-]{0,250}$} )
{
addurl 2, $url, $network; # add url to the gwebcache v2 list...
print "i|update|OK\n";
print "d|update|url|url=$url added to gwc2 list\n";
}
else
{
statlog $day, "v2_update_block_badurl";
print "i|update|WARNING|Rejected URL\n";
return;
}
}
push @actions, "update";
}
if ( $param{get} )
{
# first, do the hosts list...
my @hosts = gethosts $network;
# @hosts = pickhosts $client, @hosts;
foreach my $item ( @hosts )
{
my $age = $time - ${$item}{time};
$age = 0 if $age < 0;
print "h|${$item}{ipport}|$age|${$item}{client}\n";
}
# now do the urls...
my @urls = geturls 2, $network;
@urls = pickurls @urls;
foreach my $item ( @urls )
{
my $age = $time - ${$item}{time};
$age = 0 if $age < 0;
print "u|${$item}{url}|$age\n";
}
push @actions, "get";
}
if ( defined $param{support} )
{
my @networks = getnetworks;
foreach my $network ( @networks )
{
print "i|SUPPORT|$network\n";
}
print "i|SUPPORT|*\n"; # show we support any unknown network
}
statlog $day, "v2_".(join "_", sort @actions);
}
sub dorequest
{
# globals: read time once - to avoid possible race conditions
$time = time;
$day = int ( $time / (24*60*60) );
unless ( defined $ENV{REQUEST_METHOD} and $ENV{REQUEST_METHOD} eq 'GET' )
{
print "Content-type: text/plain\n";
print "Cache-control: no-cache\n";
print "Connection: close\n";
print "\n";
error "only GET requests supported"
}
my %param = paramdecode $ENV{QUERY_STRING};
if ( scalar %param eq "0" or defined $param{browse} )
{
dohtmlpage %param;
exit;
}
print "Content-type: text/plain\n";
print "Cache-control: no-cache\n";
print "Connection: close\n";
print "\n";
# to avoid multiple paths to this script (eg .../perlgcache.cgi and .../perlgcache.cgi/)
if ( defined $ENV{PATH_INFO} and $ENV{PATH_INFO} ne '' )
{
statlog $day, "block_badpath";
error "invalid path to script (PATH_INFO = $ENV{PATH_INFO})";
}
# block http/0.9 requests
# (there's one anonymous client which just sends "GET http://host/path/script\r\n")
if ( 1 )
{
if ( defined $ENV{SERVER_PROTOCOL} and $ENV{SERVER_PROTOCOL} eq 'HTTP/0.9' )
{
statlog $day, "block_http0.9";
error "HTTP/0.9 requests not supported. read RFC2616 or at least RFC1945";
}
}
if ( $requestlimitcount and $requestlimittime )
{
my %requestip = loadhashhash "$prefix-requestip";
# first, remove old entries...
foreach my $ip ( keys %requestip )
{
if ( $requestip{$ip}{time} < $time - $requestlimittime*60 )
{
delete $requestip{$ip};
next;
}
}
# add a blank entry if required...
unless ( defined $requestip{$ENV{REMOTE_ADDR}} )
{
$requestip{$ENV{REMOTE_ADDR}}{count} = 0;
$requestip{$ENV{REMOTE_ADDR}}{time} = $time;
}
# increment the counter...
$requestip{$ENV{REMOTE_ADDR}}{count}++;
# and save it back...
savehashhash "$prefix-requestip", %requestip;
if ( $requestip{$ENV{REMOTE_ADDR}}{count} > $requestlimitcount )
{
statlog $day, "block_requestrate";
# start adding delays to high request rate clients. the theory
# is that even abusive clients still wait for a request to complete
# before making another request
sleep 5 if $requestip{$ENV{REMOTE_ADDR}}{count} > $requestlimitcount*2;
sleep 10 if $requestip{$ENV{REMOTE_ADDR}}{count} > $requestlimitcount*3;
sleep 15 if $requestip{$ENV{REMOTE_ADDR}}{count} > $requestlimitcount*5;
# if over 10 times the standard limit, then exit without a message (saves a few bytes)
exit if $requestip{$ENV{REMOTE_ADDR}}{count} > $requestlimitcount*10;
error "request rate of $requestlimitcount requests every $requestlimittime minutes exceeded for $ENV{REMOTE_ADDR}\n";
}
}
if ( $enablev2requests and
(
!$enablev1requests or
defined $param{get} or
defined $param{update} or
defined $param{net} or
(defined $param{ping} and $param{ping} eq '2')
)
)
{
dov2request %param;
}
else
{
dov1request %param;
}
}
# regular CGI
dorequest;
# NOTE: not working yet - there's several cases where the above code does an 'exit' instead
# of a 'return', so it won't work well...
##FCGI### FastCGI
##FCGI##use FCGI;
##FCGI##my $fcgi = FCGI::Request();
##FCGI##while ( $fcgi->Accept() >= 0 )
##FCGI##{
##FCGI## dorequest;
##FCGI## $fcgi->Finish();
##FCGI## # exit if script file changes...
##FCGI## last if -M $ENV{SCRIPT_FILENAME} < 0;
##FCGI##}