#!/usr/bin/perl -wT use strict; # Copyright (c) 2003 Jon Atkins http://www.jonatkins.com/ # # Permission is hereby granted, free of charge, to any person obtaining a # copy of this software and associated documentation files (the "Software"), # to deal in the Software without restriction, including without limitation # the rights to use, copy, modify, merge, publish, distribute, sublicense, # and/or sell copies of the Software, and to permit persons to whom the # Software is furnished to do so, subject to the following conditions: # # The above copyright notice and this permission notice shall be included in # all copies or substantial portions of the Software. # # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING # FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER # DEALINGS IN THE SOFTWARE. my $cacheversion = "0.6.4"; ###### options ###### # this is added to the start of all files loaded/saved # NOTE: can be a full path or a relative path # NOTE: the cache MUST have write-access to the specified directory # eg. my $prefix = "/tmp/gcache.example.com" would store files in "/tmp/gcache.example.com-*" # eg. my $prefix = "data/data" would store files in a data directory below the location of the CGI script my $prefix = "../cgi-data/gwebcache/data"; # will use "data-*" in the same directory as this cgi script # a large number of URLs are stored, and clients get a random selection # returned on each request. # URLs are removed from the list once they are $maxurlage minutes old # if URL testing is enabled (see $checkurls below) then caches # are tested with a ping request. failed caches are also stored to prevent # repeated retests on additional updates # return this many alternate cache urls my $maxurlret = 10; # suggested range: between 10 and 20 # store this many URLs my $maxurlstore = 200; # suggested range: 200 to 500 # remove URLs after this many minutes my $maxurlage = 1*24*60; # in minutes - default 1 day # number of hosts to keep and return my $maxhost = 20; # maximum age of a host entry, in minutes my $maxhostage = 6*60; # in minutes - default 12 hours by default # set to non-zero to block clients which don't send a client and version parameter # (this setting only applys to v1 requests - v2 requests require a client string) # NOTE: in the original cache specs sending client/version was optional my $blockv1anon = 0; # log cache stats my $enablestats = 1; # minimum time, in minutes, between updates my $updatelimit = 55; # suggested range: 30 - 55 minutes # check each URL submitted works my $checkurls = 0; # HTTP proxy server to use when $checkurls is 1 - undef for no proxy my $proxyhost = undef; my $proxyport = undef; # request rate limiting # set either to zero to disable this feature my $requestlimitcount = 0; # suggested value: 0, or 5 - 10 my $requestlimittime = 55; # suggested maximum 120 (2 hours) # should we answer old-style v1 requests? my $enablev1requests = 1; # EXPERIMENTAL: see the notes at http://www.jonatkins.com/perlgcache/ # before enabling this feature. my $enablev2requests = 0; # note: if both v1/v2 are set to 0, act as if v1 is enabled ###### end of configuration ###### # NO USER OPTIONS AFTER HERE use IO::Socket; # for testing of sent URLs ###### global vars ##### # globals: read time once - to avoid possible race conditions # note: values set at the top of dorequest() - ready for FastCGI version my $time; my $day; ###### subroutines ###### # first, generic routines... sub error { my ( @args ) = @_; print "ERROR: ".(join ' ', @args)."\n"; exit; } sub paramdecode { my ( $query ) = @_; my %param; return () unless defined $query; foreach my $arg ( split /&/, $query ) { my ( $name, $val ) = split /=/, $arg; next unless defined $name and defined $val; $val =~ s/\+/ /g; $val =~ s/%([0-9a-fA-F]{2})/chr hex $1/eg; $param{$name} = $val; } return %param; } sub loadarrayhash { my ( $filename ) = @_; my @array; open IN, "$filename" or return (); my $header = ; return () unless defined $header; chomp $header; my @fields = split /\|/, $header; foreach my $line ( ) { chomp $line; my @items = split /\|/, $line, scalar @fields; my %item; for ( my $i=0; $i < @items; $i++ ) { $item{$fields[$i]} = defined $items[$i] ? $items[$i] : ''; } push @array, \%item; } close IN; #print Dumper ( \@array ); return @array; } sub savearrayhash { my ( $filename, @array ) = @_; #TODO: proper error checking in here, so we don't loose data files when the #disk is full # (build the file in a string then write in one go, to make this easier..? # may be better on file I/O performance this way) my $tmpfile = "$filename.tmp.$$"; open OUT, "> $tmpfile" or die; return if @array == 0; my @fields = sort keys %{$array[0]}; print OUT join "|", @fields; print OUT "\n"; foreach my $item ( @array ) { my $first = 1; foreach my $field ( @fields ) { print OUT "|" unless $first; $first = 0; my $val = ${$item}{$field}; $val = '' unless defined $val; print OUT "$val"; } print OUT "\n"; } close OUT; rename $tmpfile, $filename or error "rename of $tmpfile to $filename failed"; } sub loadhashhash { my ( $filename ) = @_; # right - let's get loadarrayhash to do the work here... my @array = loadarrayhash $filename; my %hash; foreach my $item ( @array ) { my $key = ${$item}{KEY}; $hash{$key} = $item; delete $hash{$key}{KEY}; } #print Dumper \%hash; return %hash; } sub savehashhash { my ( $filename, %hash ) = @_; #print Dumper \%hash; my @array; foreach my $item ( keys %hash ) { my %newitem; %newitem = %{$hash{$item}}; $newitem{KEY} = $item; push @array, \%newitem; } savearrayhash $filename, @array; } sub fixarrayfields { my ( $fieldsref, $arrayref ) = @_; foreach my $item ( @{$arrayref} ) { # add any missing fields... foreach my $field ( keys %{$fieldsref} ) { ${$item}{$field} = ${$fieldsref}{$field} unless defined ${$item}{$field}; } # .. and remove any not required... foreach my $field ( keys %{$item} ) { delete ${$item}{$field} unless exists ${$fieldsref}{$field}; } } } sub getiptype { my ( $ip ) = @_; #TODO: add support for IPv6 here... return "invalid" unless $ip =~ m/^(\d{1,3})\.(\d{1,3})\.(\d{1,3})\.(\d{1,3})$/; my ( $a, $b, $c, $d ) = ( $1, $2, $3, $4 ); return "invalid" if ( $a<0 or $a>255 or $b<0 or $b>255 or $c<0 or $c>255 or $d<0 or $d>255 ); # don't allow 0.*.*.* (this net), 127.*.*.* (loopback) or 224-255.*.*.* (multicast, class-e, broadcast) return "unusable" if ( $a==0 or $a==127 or $a>=224 ); # mark 10.*.*.* (private a), 169.254.*.* (local link), 172.16-31.*.* (private b), 192.0.2.* (test net), 192.168.*.* (private c) return "private" if ( ($a==10) or ($a==169 and $b==254) or ($a==172 and ($b>=16 and $b<=31)) or ($a==192 and $b==0 and $c==2) or ($a==192 and $b==168) ); return "public"; } # HTTP request routines - for testing of URLs.... # used for the internals of http requests... sub socket_send_and_get { my ( $host, $port, @data ) = @_; my ( $remote, $recv ); $remote = IO::Socket::INET->new ( Proto => 'tcp', PeerAddr => $host, PeerPort => $port, Timeout => 60 ); return undef unless $remote; $remote->autoflush(1); print $remote @data; local $/ = undef; $recv = <$remote>; close $remote; alarm 0; return $recv; } # sends a http request to the given url and returns all data (header and body) as a single string. sub http_request { my ( $url ) = @_; if ( $url =~ m{^http://([a-zA-Z0-9-.]+)(?::(\d{1,5}))?(.*)$} ) { my $host = $1; my $port = $2; my $path = $3; $port = 80 unless defined $port; $path = '/' if $path eq ''; if ( defined $proxyhost and defined $proxyport ) { my $reply = socket_send_and_get $proxyhost, $proxyport, "GET $url HTTP/1.0\r\n", "Host: $host:$port\r\n", "User-agent: perlgcache/$cacheversion (via proxy)\r\n", "X-RequestingIP: $ENV{REMOTE_ADDR}\r\n", "\r\n"; return $reply; } my $reply = socket_send_and_get $host, $port, "GET $path HTTP/1.0\r\n", "Host: $host:$port\r\n", "User-agent: perlgcache/$cacheversion\r\n", "X-RequestingIP: $ENV{REMOTE_ADDR}\r\n", "\r\n"; return $reply; } return undef; } # does a http request for a text/* object, processes redirect as required # splits the output into an array of lines (without line terminators) and returns an array of: # ( $code, $redircount, @lines ) sub do_http_text_request { my ( $url, $redircount ) = @_; $redircount = 0 unless defined $redircount; # to stop redirect loops... return ( -4, $redircount ) if $redircount >= 10; my $reply = http_request $url; return ( -1, $redircount ) unless defined $reply; # HTTP headers should use \r\n to seperate lines - but for robustness clients can accept any format # i'll add that here later maybe my ( $header, $body ) = split /\r\n\r\n/, $reply, 2; return ( -5, $redircount ) unless defined $header and defined $body; my @header = split /\r\n/, $header; my $statusline = shift @header; return ( -2, $redircount ) unless defined $statusline; return ( -3, $redircount ) unless ( $statusline =~ m{^HTTP/\d+\.\d+ +(\d\d\d) *(.*)$} ); my ( $code, $message ) = ( $1, $2 ); if ( $code =~ m/^3\d\d$/ ) { # we've got a redirect - find the Location: header and follow it... foreach my $headerline ( @header ) { if ( $headerline =~ m{^location: +(.*)$}i ) { # right - location headers *should* be absolute, but sometimes they're not... my $newurl = $1; if ( $newurl =~ m{^http:} ) { # full path - no problem return do_http_text_request ( $newurl, $redircount+1 ); } elsif ( $newurl =~ m{^/} ) { # server relative url - add current server and port $url =~ m{^(http://.*?)/} or die; $newurl = $1.$newurl; return do_http_text_request ( $newurl, $redircount+1 ); } else { # directory relative url - add current server and path $url =~ m{^(http:.*/)} or die; $newurl = $1.$newurl; return do_http_text_request ( $newurl, $redircount+1 ); } } } # if we get here, there was no Location header... } if ( $code ne '200' ) { # bad code - fail return ( $code, $redircount ); } # right - we've got the data - now let's check the type... foreach my $headerline ( @header ) { if ( $headerline =~ m/^content-type: +(.*$)/i ) { my $type = $1; # error on non text/* content-types... error ( -6, $redircount ) unless ( $type =~ m{^text/.*$} ); } # (if we don't find a content-type, just assume it's text) } # it's good - now we need to sort out the body lines.... # fun, because they could be seperated with just '\n', '\r\n' or even '\r' (and '\n\r'?) my @lines; # (isn't perl great!!) @lines = split /\r\n|\n\r|\n|\r/, $body; return ( $code, $redircount, @lines ); } # now, utility routines for a gwebcache... my %hostsfields = ( ipport => '0.0.0.0:0', # well we have to have something in there by default client => 'unknown', time => 0, ); sub gethosts { my ( $network ) = @_; my $file = "$prefix-hosts".($network eq 'gnutella' ? "" : "-$network"); my @hosts = loadarrayhash $file; fixarrayfields \%hostsfields, \@hosts; # TODO: remove old/bad entries return @hosts; } sub addhost { my ( $network, $ipport, $client ) = @_; my $file = "$prefix-hosts".($network eq 'gnutella' ? "" : "-$network"); my @hosts = loadarrayhash $file; fixarrayfields \%hostsfields, \@hosts; my $alreadyhave = 0; foreach my $loopipport ( @hosts ) { if ( ${$loopipport}{ipport} eq $ipport ) { $alreadyhave = 1; } } # just return if we already have it # NOTE: we should probably update it's timestamp, but it isn't really worth it return if $alreadyhave; # add entry to start of list unshift @hosts, { ipport => $ipport, client => $client, time => $time }; # remove extra hosts splice @hosts, $maxhost if @hosts > $maxhost; # and max age while ( scalar @hosts and $hosts[scalar @hosts - 1]{time} < $time - $maxhostage*60 ) { pop @hosts; } savearrayhash $file, @hosts; } #sub pickhosts #{ # my ( $client, @hosts ) = @_; # # $client = 'no such client' if $client eq 'unknown'; # # my %done; # # my @newhosts; # # # first, pick hosts of the same client type... # foreach my $item ( @hosts ) # { # next if ${$item}{client} ne $client; # next if ${$item}{time} < $time - $maxhostage*60; # # push @newhosts, $item; # $done{${$item}{ipport}} = 1; # last if @newhosts >= $maxhostret * $hostsameclientratio; # } # # # now pick some more hosts - client not important (could be more of the same client) # foreach my $item ( @hosts ) # { # next if defined $done{${$item}{ipport}}; # next if ${$item}{time} < $time - $maxhostage*60; # # push @newhosts, $item; # last if @newhosts >= $maxhostret; # } # # return @newhosts; #} my %urlfields = ( url => 'http://localhost/ignore', time => 0, version => 'unknown', ); sub geturls { my ( $cachetype, $network ) = @_; my $file = "$prefix-urls".($cachetype == 1 ? "" : "-$cachetype"); $file .= "-$network" if defined $network; my @urls = loadarrayhash $file; fixarrayfields \%urlfields, \@urls; # first, remove any URLs which are too old... # NOTE: URLs are stored newest to oldest, so we can just repeatedly drop the last one while ( @urls > 1 and $urls[@urls-1]{time} < $time - $maxurlage*60 ) { splice @urls, -1; # remove last entry } return @urls; } sub addurl { my ( $cachetype, $url, $network ) = @_; my $file = "$prefix-urls".($cachetype == 1 ? "" : "-$cachetype"); $file .= "-$network" if defined $network; my @urls = loadarrayhash $file; fixarrayfields \%urlfields, \@urls; # first, remove any URLs which are too old... # NOTE: URLs are stored newest to oldest, so we can just repeatedly drop the last one while ( @urls > 1 and $urls[@urls-1]{time} < $time - $maxurlage*60 ) { splice @urls, -1; # remove last entry } my $alreadyhave = 0; foreach my $loopurl ( @urls ) { if ( ${$loopurl}{url} eq $url ) { $alreadyhave = 1; } } # just return if we already have it # NOTE: we should really update it's timestamp, but it isn't really worth it return if $alreadyhave; my $urlcachever = 'unknown'; if ( $checkurls ) { my $requrl = $url; if ( $cachetype == 1 ) { $requrl .= "?client=TEST&version=pgc-$cacheversion&ping=1"; } else { # sending a ping isn't defined in the current GWC2 specs - so send a # get as well to ensure we trigger the correct mode of a dual-version cache $requrl .= "?client=TESTpgc-$cacheversion&ping=1&get=1"; $requrl .= "&net=$network" if defined $network; } my ( $code, undef, @lines ) = do_http_text_request $requrl; if ( $code ne '200' ) { $urlcachever = 'FAILED'; } else { if ( $cachetype == 1 ) { # v1 test - at least one line, and the first line starts with PONG if ( @lines >= 1 and $lines[0] =~ m{^PONG\s*([a-zA-Z0-9. _/+-]{0,100})} ) { $urlcachever = $1 if $1 ne ''; # ignore if blank } else { $urlcachever = 'FAILED'; } } else { # v2 test - each line must be of the form '[a-zA-Z]|.*' $urlcachever = 'FAILED' if @lines == 0; foreach my $line ( @lines ) { chomp $line; if ( $line =~ m{^[iI]\|[pP][oO][nN][gG]\|([a-zA-Z0-9. _/+-]{0,100})} ) { $urlcachever = $1 if $1 ne ''; } elsif ( $line !~ m/^[a-zA-Z0-9]\|/ ) { $urlcachever = 'FAILED'; last; } } } } } # add entry unshift @urls, { url => $url, time => $time, version => $urlcachever }; # remove extra urls splice @urls, $maxurlstore if @urls > $maxurlstore; savearrayhash $file, @urls; } sub pickurls { my ( @urls ) = @_; my @newurls; while ( @urls >= 1 and @newurls < $maxurlret ) { my $rand = int ( rand @urls ); my $item = splice @urls, $rand, 1; # pick an item from the list and remove it next if ${$item}{version} eq 'FAILED'; # ignore failed URLs push @newurls, $item; } return @newurls; } sub getnetworks { my %networks = (); # get the networks from the hosts files... foreach my $file ( <$prefix-hosts*> ) { # nasty hack to allow for no -gnutella suffix special case $file = "$prefix-hosts-gnutella" if $file eq "$prefix-hosts"; $networks{substr $file, length "$prefix-hosts-"} = 1; } # .. and the url files - for the gwc2 network only foreach my $file ( <$prefix-urls-2-*> ) { # nasty hack not needed - we always append the network name for gwc2 url databases # $file = "$prefix-urls-gnutella" if $file eq "$prefix-urls"; $networks{substr $file, length "$prefix-urls-2-"} = 1; } return sort keys %networks; } sub statlog { return unless $enablestats; my ( $day, $action ) = @_; # NOTE: loadhashhash REQUIRES that all of the hashes have the same members, or data WILL NOT be saved correctly, # so use this to check... my %stattypes = ( update => 1, hostfile => 1, urlfile => 1, statfile => 1, ping => 1, badrequest => 1, block_requestrate => 1, block_anon => 1, update_block_badport => 1, update_block_badip => 1, update_block_ratelimit => 1, block_badpath => 1, # v2 actions are any combination of get, ping and update - always sorted v2_get => 1, v2_ping => 1, v2_update => 1, v2_get_ping => 1, v2_get_update => 1, v2_ping_update => 1, v2_get_ping_update => 1, v2_block_anon => 1, v2_update_block_badport => 1, v2_update_block_badip => 1, v2_update_block_badurl => 1, v2_update_block_ratelimit => 1, ); # make sure this stat type exists... return unless defined $stattypes{$action}; my %stats = loadhashhash "$prefix-stats"; # add in this stat item... $stats{$day}{$action} = 0 unless defined $stats{$day}{$action}; $stats{$day}{$action}++; # and also into the totals... $stats{'total'}{$action} = 0 unless defined $stats{$day}{$action}; $stats{'total'}{$action}++; foreach my $statday ( keys %stats ) { # now remove old stat data... if ( $statday ne 'total' and $statday < $day - 28 ) { delete $stats{$statday}; next; } # remove bogus stats... foreach my $stattype ( keys %{$stats{$statday}} ) { delete $stats{$statday}{$stattype} unless defined $stattypes{$stattype}; } # and add unset stats... foreach my $stattype ( keys %stattypes ) { $stats{$statday}{$stattype} = 0 unless defined $stats{$statday}{$stattype}; } } savehashhash "$prefix-stats", %stats; } # returns an array of ( $client, $version ) from a decoded cgi parameter hash # allows for both v1 client=XXX&version=a.b.c and v2 client=XXXXa.b.c style versions... sub getclientandversion { my ( %p ) = @_; # the V2 specs say client and version should be combined into a single parameter - work around that here... if ( defined $p{client} and !defined $p{version} and $p{client} =~ m/^([A-Za-z0-9]{4})(.+)$/ ) { $p{client} = $1; $p{version} = $2; } my $client; if ( !defined $p{client} ) { $client = "unknown"; } elsif ( $p{client} =~ m/^[a-zA-Z0-9.+-]{1,20}$/ ) { $client = $p{client}; } else { $client = "unknown_bad"; } my $version; if ( !defined $p{version} ) { $version = "unknown"; } elsif ( $p{version} =~ m{^[a-zA-Z0-9/.+ @\-]{1,40}$} ) { $version = $p{version}; } else { $version = "unknown_bad"; } return ( $client, $version ); } ###### main code ###### # some subs, to break up the code a little... sub dohtmlpage { my ( %param ) = @_; $| = 1; my $browse = "index"; $browse = $param{browse} if defined $param{browse} and $browse =~ m/^[a-zA-Z0-9]{1,100}$/; print "Content-type: text/html\n"; print "\n"; print <perlgcache $cacheversion

perlgcache $cacheversion

[ home | hosts | urls | limited IPs | stats ]

END # right - it's a good idea to test the data directory is writable here - let's create then delete a file my $ok = 0; my $tmpfile = "$prefix-writetest.$$.tmp"; if ( open OUT, "> $tmpfile" ) { close OUT; unlink $tmpfile; $ok = 1; } if ( ! $ok ) { print "
\n"; print "

Datafile write failure

\n"; print "

This cache is not correctly configured!

\n"; print "

You need to configure \$prefix to point to a writable directory.

\n"; print "
\n"; } if ( $browse eq 'stats' ) { if ( $enablestats ) { my %stats = loadhashhash "$prefix-stats"; print "\n"; print "\n"; foreach my $stattype ( sort keys %{$stats{'total'}} ) { next if $stats{'total'}{$stattype} == 0; my $statdisp = $stattype; $statdisp =~ s/_/ /g; print ""; } print ""; print "\n"; foreach my $statday ( sort keys %stats ) { print "\n" if $statday eq 'total'; my $daytext = "Total"; $daytext = scalar gmtime ($statday*60*60*24) unless $statday eq 'total'; $daytext =~ s/00:00:00 //; $daytext =~ s/ / /g; print ""; my $total = 0; foreach my $stattype ( sort keys %{$stats{'total'}} ) { next if $stats{'total'}{$stattype} == 0; print ""; $total += $stats{$statday}{$stattype}; } print ""; print "\n"; } print "
Cache stats
$statdispTotal
$daytext$stats{$statday}{$stattype}$total
\n"; } } elsif ( $browse eq 'hosts' ) { foreach my $network ( getnetworks ) { my @hosts = gethosts $network; print "\n"; print "\n"; foreach my $item ( @hosts ) { next if ${$item}{time} < $time - $maxhostage*60; print "\n"; } print "\n" if scalar @hosts == 0; print "\n"; print "
Host cache - net=$network
IP:PortClientTime
${$item}{ipport}${$item}{client}".(scalar gmtime ${$item}{time})."
no hosts
Maximum of $maxhost hosts stored/returned.
\n"; print "Hosts are no older than $maxhostage minutes
\n"; } } elsif ( $browse eq 'urls' ) { my @urls = geturls 1; if ( $enablev1requests ) { print "\n"; print "\n"; foreach my $item ( @urls ) { print "\n"; } print "\n" if scalar @urls == 0; print "
URL cache - GWebcache
URLVersionTime
${$item}{url}${$item}{version}".(scalar gmtime ${$item}{time})."
no urls
\n"; } if ( $enablev2requests ) { foreach my $network ( getnetworks ) { my @urls = geturls 2, $network; ##loadarrayhash "$prefix-urls"; print "\n"; print "\n"; foreach my $item ( @urls ) { print "\n"; } print "\n" if scalar @urls == 0; print "
URL cache - GWebcache2 - net=$network
URLVersionTime
${$item}{url}${$item}{version}".(scalar gmtime ${$item}{time})."
no urls
\n"; } } } elsif ( $browse eq 'limited' ) { my %updateip = loadhashhash "$prefix-updateip"; print "\n"; print "\n"; foreach my $ip ( sort { $updateip{$b}{time} <=> $updateip{$a}{time} } keys %updateip ) { next if $updateip{$ip}{time} < $time - $updatelimit*60; print "\n"; } print "\n"; print "\n" if scalar keys %updateip == 0; print "
Recent updates
Client IPUpdate at
$ip".(scalar gmtime $updateip{$ip}{time})."
No updates for another $updatelimit minutes
no blocked IPs
\n"; if ( $requestlimitcount and $requestlimittime ) { my %requestip = loadhashhash "$prefix-requestip"; print "\n"; print "\n"; my %lowcount = (); foreach my $ip ( sort { $requestip{$b}{count} <=> $requestip{$a}{count} } keys %requestip ) { next if $requestip{$ip}{time} < $time - $requestlimittime*60; if ( $requestip{$ip}{count} <= $requestlimitcount ) { $lowcount{$requestip{$ip}{count}}++; } else { print "\n"; } } foreach my $count ( sort { $b <=> $a } keys %lowcount ) { print "\n"; } print "\n"; print "
Recent requests
Client IPcounttime
$ip$requestip{$ip}{count}".(scalar gmtime $requestip{$ip}{time})."
$lowcount{$count} IP(s) have made $count request(s) in the last $requestlimittime minutes
No more than $requestlimitcount requests every $requestlimittime minutes
\n"; } } else { print <Welcome to this gnutella web cache.


perlgcache by Jon Atkins

END } print "

Current time: ".(scalar gmtime $time)."

\n"; print < END } sub dov1request { my ( %param ) = @_; my ( $client, $version ) = getclientandversion %param; my $network = 'gnutella'; ## NOTE: default net name - supplying no net= param is the same as sending this value ## $network = lc $1 if defined $param{net} and $param{net} =~ m/^([a-zA-Z0-9._-]{1,20})$/; ## # (NOTE: above restrictions are the same as I use for the client string - no particular reason for this) if ( $blockv1anon ) { # unless ( defined $param{client} and defined $param{version} ) if ( $client =~ m/^unknown/ ) # just check for client { statlog $day, "block_anon"; error "anonymous clients not allowed here"; } } # bodge: works around differences with an older version of the specs $param{url} = $param{url1} if !defined $param{url} and defined $param{url1}; $param{ip} = $param{ip1} if !defined $param{ip} and defined $param{ip1}; if ( defined $param{url} or defined $param{ip} ) { # limit update rate by remembering the cache... my %updateip = loadhashhash "$prefix-updateip"; foreach my $ip ( keys %updateip ) { if ( $updateip{$ip}{time} < $time - $updatelimit*60 ) { delete $updateip{$ip}; next; } if ( $ENV{REMOTE_ADDR} eq $ip ) { statlog $day, "update_block_ratelimit"; print "OK\n"; print "WARNING: update denied: $ENV{REMOTE_ADDR} has sent updates in the last $updatelimit minutes\n"; exit; } } $updateip{$ENV{REMOTE_ADDR}}{time} = $time; savehashhash "$prefix-updateip", %updateip; print "OK\n"; if ( defined $param{ip} ) { my $ipport = $param{ip}; if ( $ipport =~ m{^(\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}):(\d{1,5})$} ) { my ( $ip, $port ) = ( $1, $2 ); if ( $port < 1 or $port > 65535 ) { statlog $day, "update_block_badport"; error "port $port out of range"; } my $iptype = getiptype $ip; # TODO: allow 'private' addresses in certain situations?? unless ( $iptype eq 'public' ) { statlog $day, "update_block_badip"; error "ip $ip invalid ($iptype)"; } unless( $client eq "MUTE" ) { error "client type $client invalid"; } addhost "gnutella", $ipport, $client; } else { print "WARNING: invalid format ip parameter\n"; } } if ( defined $param{url} ) { my $url = $param{url}; # bodge to change %7e to ~ $url =~ s{%7[eE]}{~}g; if ( $url =~ m{^http://[a-zA-Z0-9.-]{1,200}(:\d{1,5})?/[a-zA-Z0-9/.~_-]{0,250}$} ) { addurl 1, $url; # add url to the gwebcache v1 list... } else { print "WARNING: invalid format/character in url\n"; } } statlog $day, "update"; } elsif ( defined $param{hostfile} ) { my @hosts = gethosts $network; ##loadarrayhash "$prefix-hosts"; # @hosts = pickhosts $client, @hosts; my $testclient = $client eq 'unknown' ? 'not set' : $client; my $outcount = 0; foreach my $item ( @hosts ) { print "${$item}{ipport}\n"; } # hack: the v1 cache specs allows caches to return no lines, but some clients # assume this is an error. let's always send something, even if it's useless # NOTE: this only happens when the cache has no hosts if ( @hosts == 0 ) { print "0.0.0.0:0\n"; # an invalid IP/port but should match the format clients are expecting } statlog $day, "hostfile"; } elsif ( defined $param{urlfile} ) { my @urls = geturls 1; # load gwebcache v1 list @urls = pickurls @urls; foreach my $item ( @urls ) { print "${$item}{url}\n"; } # hack: the v1 cache specs allows caches to return no lines, but some clients # assume this is an error. let's always send something, even if it's useless # NOTE: this only happens when the cache has no valid URLs if ( @urls == 0 ) { print "http://0.0.0.0:0/\n"; # an invalid IP/port but should match the format clients are expecting } statlog $day, "urlfile"; } elsif ( defined $param{ping} ) { print "PONG perlgcache/$cacheversion\n"; statlog $day, "ping"; } elsif ( defined $param{statfile} ) { if ( $enablestats ) { my %stats = loadhashhash "$prefix-stats"; my ( $totalreq, $dayreq, $dayupd ) = ( 0, 0, 0 ); my $statday = $day - 1; # type1: count all requests for the requests figure... foreach my $statitem ( keys %{$stats{'total'}} ) ## # type2: count just hostfile and urlfile requests ## foreach my $statitem ( 'hostfile', 'urlfile' ) { $dayreq += $stats{$statday}{$statitem} if defined $stats{$statday}; $totalreq += $stats{'total'}{$statitem}; } $dayupd += $stats{$statday}{'update'} if defined $stats{$statday}; $dayupd += $stats{$statday}{'v2_update'} if defined $stats{$statday}; print $totalreq."\n"; # NOTE: the statfile=1 specs ask for hour stats, but we only store day stats... fake them... print int($dayreq/24)."\n"; print int($dayupd/24)."\n"; } else { print "NOTE: statfile not supported\n"; } statlog $day, "statfile"; } else { statlog $day, "badcommand"; error "unknown command"; } } sub dov2request { my ( %param ) = @_; my ( $client, $version ) = getclientandversion %param; my $network = 'gnutella'; ## NOTE: default net name - supplying no net= param is the same as sending this value $network = lc $1 if defined $param{net} and $param{net} =~ m/^([a-zA-Z0-9._-]{1,20})$/; # (NOTE: above restrictions are the same as I use for the client string - no particular reason for this) if ( 1 ) # always enforce client parameter for v2 requests { if ( $client =~ m/^unknown/ ) # just check for client { statlog $day, "v2_block_anon"; error "V2 Anonymous clients not allowed here"; } } # OK - the v2 specs say that a cache must always return something - returning nothing is an error # they also specify a response to a ping but not the query to make! # there's also the fact that unwanted info can always be returned # so... - let's always output our ping line... print "i|pong|perlgcache/$cacheversion\n"; my @actions; if ( defined $param{ping} ) { push @actions, "ping"; } if ( defined $param{update} ) { # limit update rate by remembering the cache... my %updateip = loadhashhash "$prefix-updateip"; foreach my $ip ( keys %updateip ) { if ( $updateip{$ip}{time} < $time - $updatelimit*60 ) { delete $updateip{$ip}; next; } if ( $ENV{REMOTE_ADDR} eq $ip ) { statlog $day, "update_block_ratelimit"; print "i|update|WARNING|You came back too soon\n"; exit; } } $updateip{$ENV{REMOTE_ADDR}}{time} = $time; savehashhash "$prefix-updateip", %updateip; if ( defined $param{ip} ) { my $ipport = $param{ip}; if ( $ipport =~ m{^(\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}):(\d{1,5})$} ) { my ( $ip, $port ) = ( $1, $2 ); if ( $port < 1 or $port > 65535 ) { statlog $day, "v2_update_block_badport"; print "i|update|WARNING|Rejected IP\n"; print "d|update|ip|port $port out of range\n"; return; } my $iptype = getiptype $ip; # TODO: allow 'private' addresses in certain situations?? unless ( $iptype eq 'public' ) { statlog $day, "v2_update_block_badip"; print "i|update|WARNING|Rejected IP\n"; print "d|update|ip|ip $ip isn't public - $iptype\n"; } addhost $network, $ipport, $client; print "i|update|OK\n"; print "d|update|ip|ip=$ip:$port net=$network accepted\n"; } } if ( defined $param{url} ) { my $url = $param{url}; # bodge to change %7e to ~ $url =~ s{%7[eE]}{~}g; if ( $url =~ m{^http://[a-zA-Z0-9.-]{1,200}(:\d{1,5})?/[a-zA-Z0-9/.~_-]{0,250}$} ) { addurl 2, $url, $network; # add url to the gwebcache v2 list... print "i|update|OK\n"; print "d|update|url|url=$url added to gwc2 list\n"; } else { statlog $day, "v2_update_block_badurl"; print "i|update|WARNING|Rejected URL\n"; return; } } push @actions, "update"; } if ( $param{get} ) { # first, do the hosts list... my @hosts = gethosts $network; # @hosts = pickhosts $client, @hosts; foreach my $item ( @hosts ) { my $age = $time - ${$item}{time}; $age = 0 if $age < 0; print "h|${$item}{ipport}|$age|${$item}{client}\n"; } # now do the urls... my @urls = geturls 2, $network; @urls = pickurls @urls; foreach my $item ( @urls ) { my $age = $time - ${$item}{time}; $age = 0 if $age < 0; print "u|${$item}{url}|$age\n"; } push @actions, "get"; } if ( defined $param{support} ) { my @networks = getnetworks; foreach my $network ( @networks ) { print "i|SUPPORT|$network\n"; } print "i|SUPPORT|*\n"; # show we support any unknown network } statlog $day, "v2_".(join "_", sort @actions); } sub dorequest { # globals: read time once - to avoid possible race conditions $time = time; $day = int ( $time / (24*60*60) ); unless ( defined $ENV{REQUEST_METHOD} and $ENV{REQUEST_METHOD} eq 'GET' ) { print "Content-type: text/plain\n"; print "Cache-control: no-cache\n"; print "Connection: close\n"; print "\n"; error "only GET requests supported" } my %param = paramdecode $ENV{QUERY_STRING}; if ( scalar %param eq "0" or defined $param{browse} ) { dohtmlpage %param; exit; } print "Content-type: text/plain\n"; print "Cache-control: no-cache\n"; print "Connection: close\n"; print "\n"; # to avoid multiple paths to this script (eg .../perlgcache.cgi and .../perlgcache.cgi/) if ( defined $ENV{PATH_INFO} and $ENV{PATH_INFO} ne '' ) { statlog $day, "block_badpath"; error "invalid path to script (PATH_INFO = $ENV{PATH_INFO})"; } # block http/0.9 requests # (there's one anonymous client which just sends "GET http://host/path/script\r\n") if ( 1 ) { if ( defined $ENV{SERVER_PROTOCOL} and $ENV{SERVER_PROTOCOL} eq 'HTTP/0.9' ) { statlog $day, "block_http0.9"; error "HTTP/0.9 requests not supported. read RFC2616 or at least RFC1945"; } } if ( $requestlimitcount and $requestlimittime ) { my %requestip = loadhashhash "$prefix-requestip"; # first, remove old entries... foreach my $ip ( keys %requestip ) { if ( $requestip{$ip}{time} < $time - $requestlimittime*60 ) { delete $requestip{$ip}; next; } } # add a blank entry if required... unless ( defined $requestip{$ENV{REMOTE_ADDR}} ) { $requestip{$ENV{REMOTE_ADDR}}{count} = 0; $requestip{$ENV{REMOTE_ADDR}}{time} = $time; } # increment the counter... $requestip{$ENV{REMOTE_ADDR}}{count}++; # and save it back... savehashhash "$prefix-requestip", %requestip; if ( $requestip{$ENV{REMOTE_ADDR}}{count} > $requestlimitcount ) { statlog $day, "block_requestrate"; # start adding delays to high request rate clients. the theory # is that even abusive clients still wait for a request to complete # before making another request sleep 5 if $requestip{$ENV{REMOTE_ADDR}}{count} > $requestlimitcount*2; sleep 10 if $requestip{$ENV{REMOTE_ADDR}}{count} > $requestlimitcount*3; sleep 15 if $requestip{$ENV{REMOTE_ADDR}}{count} > $requestlimitcount*5; # if over 10 times the standard limit, then exit without a message (saves a few bytes) exit if $requestip{$ENV{REMOTE_ADDR}}{count} > $requestlimitcount*10; error "request rate of $requestlimitcount requests every $requestlimittime minutes exceeded for $ENV{REMOTE_ADDR}\n"; } } if ( $enablev2requests and ( !$enablev1requests or defined $param{get} or defined $param{update} or defined $param{net} or (defined $param{ping} and $param{ping} eq '2') ) ) { dov2request %param; } else { dov1request %param; } } # regular CGI dorequest; # NOTE: not working yet - there's several cases where the above code does an 'exit' instead # of a 'return', so it won't work well... ##FCGI### FastCGI ##FCGI##use FCGI; ##FCGI##my $fcgi = FCGI::Request(); ##FCGI##while ( $fcgi->Accept() >= 0 ) ##FCGI##{ ##FCGI## dorequest; ##FCGI## $fcgi->Finish(); ##FCGI## # exit if script file changes... ##FCGI## last if -M $ENV{SCRIPT_FILENAME} < 0; ##FCGI##}