#!/usr/local/bin/perl5.8.8 # LW2 version 2.1 # # LW2 copyright 2000-2004 by rfp.labs # # This program is free software; you can redistribute it and/or # modify it under the terms of the GNU General Public License # as published by the Free Software Foundation; either version 2 # of the License, or (at your option) any later version. # # This program is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU General Public License for more details. # =head1 NAME LW2 - Perl HTTP library version 2.1 =head1 SYNOPSIS use LW2; require 'LW2.pm'; =head1 DESCRIPTION Libwhisker is a Perl library useful for HTTP testing scripts. It contains a pure-Perl reimplementation of functionality found in the C, C, C, C, C, C, C, C, C, C, and C modules. Libwhisker is designed to be portable (a single perl file), fast (general benchmarks show libwhisker is faster than LWP), and flexible (great care was taken to ensure the library does exactly what you want to do, even if it means breaking the protocol). =head1 FUNCTIONS The following are the functions contained in Libwhisker: =over 4 =cut package LW2; $LW2::VERSION="2.1"; $PACKAGE='LW2'; BEGIN { package LW2; $PACKAGE='LW2'; ## LW module manager stuff ## %AVAILABLE = (); $LW_SSL_LIB = 0; $LW_NONBLOCK_CONNECT=0; eval "require MIME::Base64"; if(!$@){ $AVAILABLE{'mime::base64'}=$MIME::Base64::VERSION; 1 if($MIME::Base64::VERSION); } eval "require MD5"; if(!$@){ $AVAILABLE{'md5'}=$MD5::VERSION; 1 if($MD5::VERSION); } eval "use Socket"; if(!$@){ $AVAILABLE{'socket'}=$Socket::VERSION; eval "use Net::SSLeay"; # do we have SSL support? if(!$@){ $LW_SSL_LIB=1; $AVAILABLE{'net::ssleay'}=$Net::SSLeay::VERSION; 1 if($Net::SSLeay::VERSION); Net::SSLeay::load_error_strings(); Net::SSLeay::SSLeay_add_ssl_algorithms(); Net::SSLeay::randomize(); } else { eval "use Net::SSL"; if(!$@){ $LW_SSL_LIB=2; $AVAILABLE{'net::ssl'}=$Net::SSL::VERSION; 1 if($Net::SSL::VERSION); } } if($^O!~/Win32/){ eval "use POSIX qw(:errno_h :fcntl_h)"; if(!$@){ $LW_NONBLOCK_CONNECT=1; } } } } # BEGIN ######################################################################## =item B Params: $auth_method, \%req, $user, \@passwords [, $domain, $fail_code ] Return: $first_valid_password, undef if error/none found Perform a HTTP authentication brute force against a server (host and URI defined in %req). It will try every password in the password array for the given user. The first password (in conjunction with the given user) that doesn't return HTTP 401 is returned (and the brute force is stopped at that point). You should retry the request with the given password and double-check that you got a useful HTTP return code that indicates successful authentication (200, 302), and not something a bit more abnormal (407, 500, etc). $domain is optional, and is only used for NTLM auth. Note: set up any proxy settings and proxy auth in %req before calling this function. You can brute-force proxy authentication by setting up the target proxy as proxy_host and proxy_port in %req, using an arbitrary host and uri (preferably one that is reachable upon successful proxy authorization), and setting the $fail_code to 407. The $auth_method passed to this function should be a proxy-based one ('proxy-basic', 'proxy-ntlm', etc). if your server returns something other than 401 upon auth failure, then set $fail_code to whatever is returned (and it needs to be something *different* than what is received on auth success, or this function won't be able to tell the difference). =cut sub auth_brute_force { my ($auth_method, $hrin, $user, $pwordref, $dom, $fail_code)=@_; my ($P,%hout); $fail_code||=401; return undef if(!defined $auth_method || length($auth_method)==0); return undef if(!defined $user || length($user) ==0); return undef if(!(defined $hrin && ref($hrin) )); return undef if(!(defined $pwordref && ref($pwordref))); map { ($P=$_)=~tr/\r\n//d; auth_set_header($auth_method,$hrin,$user,$P,$dom); return undef if(http_do_request($hrin,\%hout)); return $P if($hout{whisker}->{code} != $fail_code); } @$pwordref; return undef;} ######################################################################## =item B Params: \%req Return: nothing (modifies %req) Modifes %req to disable all authentication (regular and proxy). Note: it only removes the values set by auth_set(). Manually-defined [Proxy-]Authorization headers will also be deleted (but you shouldn't be using the auth_* functions if you're manually handling your own auth...) =cut sub auth_unset { my $href=shift; return if(!defined $href || !ref($href)); delete $$href{Authorization}; delete $$href{'Proxy-Authorization'}; delete $$href{whisker}->{auth_callback}; delete $$href{whisker}->{auth_proxy_callback}; delete $$href{whisker}->{auth_data}; delete $$href{whisker}->{auth_proxy_data}; } ######################################################################## =item B Params: $auth_method, \%req, $user, $password [, $domain] Return: nothing (modifies %req) Modifes %req to use the indicated authentication info. Auth_method can be: 'basic', 'proxy-basic', 'ntlm', 'proxy-ntlm'. Note: this function may not necessarily set any headers after being called. =cut sub auth_set { my ($method, $href, $user, $pass, $domain)=(lc(shift),@_); return if(!(defined $href && ref($href))); return if(!defined $user || !defined $pass); if($method eq 'basic'){ $$href{'Authorization'}='Basic '.encode_base64($user.':'.$pass,''); } if($method eq 'proxy-basic'){ $$href{'Proxy-Authorization'}='Basic '.encode_base64($user.':'.$pass,''); } if($method eq 'ntlm'){ http_close($href); $$href{whisker}->{auth_data}=ntlm_new($user,$pass,$domain); $$href{whisker}->{auth_callback}=\&_ntlm_auth_callback; } if($method eq 'proxy-ntlm'){ die("Libwhisker error: proxy-ntlm auth w/ SSL not currently supported") if($href->{whisker}->{ssl}>0); http_close($href); $$href{whisker}->{auth_proxy_data}=ntlm_new($user,$pass,$domain); $$href{whisker}->{auth_proxy_callback}=\&_ntlm_auth_proxy_callback; } } =item B Cookies are stored in a "jar" (hash), indexed by cookie name. The contents are an anonymous array: $jar{'name'}=[ 'value', 'domain', 'path', 'expire', 'secure' ] $jar{'name'}=[ 'value', { ..parameters.. }, version ] =cut ######################################################################## =item B Params: \%jar, \%response Return: $num_of_cookies_read Read in cookies from an %response hash, and put them in %jar. Notice: cookie_read uses internal magic done by http_do_request in order to read cookies regardless of 'Set-Cookie' header appearance. =cut sub cookie_read { my ($count,$jarref,$href)=(0,@_); return 0 if(!(defined $jarref && ref($jarref))); return 0 if(!(defined $href && ref($href) )); return 0 if(!(defined $$href{whisker}->{cookies} && ref($$href{whisker}->{cookies}))); foreach (@{$href->{whisker}->{cookies}}){ cookie_parse($jarref,$_); $count++; } return $count; } ######################################################################## =item B Params: \%jar, $cookie Return: nothing Parses the cookie into the various parts and then sets the appropriate values in the %jar under the name; if the cookie is blank, it will delete it from the jar. =cut sub cookie_parse { my ($jarref, $header)=@_; my ($del,$part,@parts,@construct,$cookie_name)=(0); return if(!(defined $jarref && ref($jarref))); return if(!(defined $header && length($header)>0)); @parts=split(/;/,$header); foreach $part (@parts){ if($part=~/^[ \t]*(.+?)=(.*)$/){ my ($name,$val)=($1,$2); if($name=~/^domain$/i){ $val=~s#^http://##; $val=~s#/.*$##; $construct[1]=$val; } elsif($name=~/^path$/i){ $val=~s#/$## if($val ne '/'); $construct[2]=$val; } elsif($name=~/^expires$/i){ $construct[3]=$val; } else { $cookie_name=$name; if($val eq ''){ $del=1; } else { $construct[0]=$val;} } } else { if($part=~/secure/){ $construct[4]=1;} } } if($del){ delete $$jarref{$cookie_name} if defined $$jarref{$cookie_name}; } else { $$jarref{$cookie_name}=\@construct; } } ######################################################################## =item B Params: \%jar, \%request, $override Return: nothing Goes through the given jar and sets the Cookie header in %req pending the correct domain and path. If $override is true, then the domain and path restrictions of the cookies are ignored. Todo: factor in expire and secure. =cut sub cookie_write { my ($jarref, $hin, $override)=@_; my ($name,$out)=('',''); return if(!(defined $jarref && ref($jarref))); return if(!(defined $hin && ref($hin) )); $override=$override||0; $$hin{'whisker'}->{'ssl'}=$$hin{'whisker'}->{'ssl'}||0; foreach $name (keys %$jarref){ next if($name eq ''); next if($$hin{'whisker'}->{'ssl'}==0 && $$jarref{$name}->[4]>0); if($override || ($$hin{'whisker'}->{'host'}=~/$$jarref{$name}->[1]$/i && $$hin{'whisker'}->{'uri'}=~/$$jarref{$name}->[2]/i)){ $out.="$name=$$jarref{$name}->[0];"; } } if($out ne ''){ $$hin{'Cookie'}=$out; } } ######################################################################## =item B Params: \%jar, $name Return: @elements Fetch the named cookie from the jar, and return the components. =cut sub cookie_get { my ($jarref,$name)=@_; return undef if(!(defined $jarref && ref($jarref))); if(defined $$jarref{$name}){ return @{$$jarref{$name}};} return undef; } ######################################################################## =item B Params: \%jar, $name, $value, $domain, $path, $expire, $secure Return: nothing Set the named cookie with the provided values into the %jar. =cut sub cookie_set { my ($jarref,$name,$value,$domain,$path,$expire,$secure)=@_; my @construct; return if(!(defined $jarref && ref($jarref))); return if($name eq ''); if($value eq ''){ delete $$jarref{$name}; return;} $path=$path||'/'; $secure=$secure||0; @construct=($value,$domain,$path,$expire,$secure); $$jarref{$name}=\@construct; } ######################################################################## ##################################################### # cluster global variables %_crawl_config=('save_cookies' => 0, 'reuse_cookies' => 1, 'save_offsites' => 0, 'save_non_http' => 0, 'follow_moves' => 1, 'url_limit' => 1000, 'use_params' => 0, 'params_double_record' => 0, 'skip_ext' => { gif=>1, jpg=>1, png=>1, gz=>1, swf=>1, pdf=>1, zip=>1, wav=>1, mp3=>1, asf=>1, tgz=>1 }, 'save_skipped' => 0, 'save_referrers'=> 0, 'do_head' => 0, 'callback' => 0, 'netloc_bug' => 1, 'normalize_uri' => 1, 'source_callback' => 0 ); %_crawl_linktags = ( 'a' => 'href', 'applet' => [qw(codebase archive code)], 'area' => 'href', 'base' => 'href', 'bgsound' => 'src', 'blockquote' => 'cite', 'body' => 'background', 'del' => 'cite', 'embed' => [qw(src pluginspage)], 'form' => 'action', 'frame' => [qw(src longdesc)], 'iframe' => [qw(src longdesc)], 'ilayer' => 'background', 'img' => [qw(src lowsrc longdesc usemap)], 'input' => [qw(src usemap)], 'ins' => 'cite', 'isindex' => 'action', 'head' => 'profile', 'layer' => [qw(background src)], 'link' => 'href', # 'meta' => 'http-equiv', 'object' => [qw(codebase data archive usemap)], 'q' => 'cite', 'script' => 'src', 'table' => 'background', 'td' => 'background', 'th' => 'background', 'xmp' => 'href', ); ##################################################### =item B Params: $START, $MAX_DEPTH, \%request_hash [, \%tracking_hash ] Return: $crawl_object The crawl_new() functions initializes a crawl object (hash) to the default values, and then returns it for later use by crawl(). $START is the starting URL (in the form of 'http://www.host.com/url'), and MAX_DEPTH is the maximum number of levels to crawl (the START URL counts as 1, so a value of 2 will crawl the START URL and all URLs found on that page). The request_hash is a standard initialized request hash to be used for requests; you should set any authentication information or headers in this hash in order for the crawler to use them. The optional tracking_hash lets you supply a hash for use in tracking URL results (otherwise crawl_new() will allocate a new anon hash). =cut sub crawl_new { my ($start, $depth, $reqref, $trackref)=@_; my %X; return undef if(!defined $start || !defined $depth); return undef if(!defined $reqref || !ref($reqref)); $trackref={} if(!defined $trackref || !ref($trackref)); $X{track} =$trackref; $X{request} =$reqref; $X{depth} =$depth||2; $X{start} =$start; $X{magic} =7340; $X{reset}= sub { $X{errors} =[]; # all errors encountered $X{urls} =[]; # temp; used to hold all URLs on page $X{server_tags} ={}; # all server tags found $X{referrers} ={}; # who refers to what URLs $X{offsites} ={}; # all URLs that point offsite $X{response} ={}; # temp; the response hash $X{non_http} ={}; # all non_http URLs found $X{cookies} ={}; # all cookies found $X{forms} ={}; # all forms found $X{jar} ={}; # temp; cookie jar $X{config} ={}; %{ $X{config} } = %_crawl_config; %{ $X{track} } =(); $X{parsed_page_count} =0; }; $X{crawl}= sub { crawl(\%X, @_) }; $X{reset}->(); return \%X; } ##################################################### =item B Params: $crawl_object [, $START, $MAX_DEPTH ] Return: $count [ undef on error ] The heart of the crawl package. Will perform an HTTP crawl on the specified HOST, starting at START URI, proceeding up to MAX_DEPTH. Crawl_object needs to be the variable returned by crawl_new(). You can also indirectly call crawl() via the crawl_object itself: $crawl_object->{crawl}->($START,$MAX_DEPTH) Returns the number of URLs actually crawled (not including those skipped). =cut { # START OF CRAWL CONTAINER sub crawl { my ($C, $START, $MAX_DEPTH)=@_; return undef if(!defined $C || !ref($C) || $C->{magic}!=7340); # shortcuts, to reduce dereferences and typing my $CONFIG = $C->{config}; my $TRACK = $C->{track}; my $URLS = $C->{urls}; my $RESP = $C->{response}; my $REQ = $C->{request}; $START ||= $C->{start}; $C->{depth} = $MAX_DEPTH || $C->{depth}; my ($COUNT, $T, @ST, @url_queue)=(0, ''); # ST[] = [ 0.HOST, 1.PORT, 2.URL, 3.DEPTH, 4.CWD ] my @v=uri_split($START); my $error = undef; $error = 'Start protocol not http or https' if($v[1] ne 'http' && $v[1] ne 'https'); $error = 'Bad start host' if(!defined $v[2] || $v[2] eq ''); push ( @{ $C->{errors} }, $error ) && return undef if(defined $error); @ST = ( $v[2], $v[3], $v[0], 1, '' ); $REQ->{whisker}->{ssl} = 1 if($v[1] eq 'https'); $REQ->{whisker}->{host}=$ST[0]; $REQ->{whisker}->{port}=$ST[1]; $REQ->{whisker}->{lowercase_incoming_headers}=1; $REQ->{whisker}->{ignore_duplicate_headers} =0; delete $REQ->{whisker}->{parameters}; http_fixup_request($REQ); push @url_queue, \@ST; while(@url_queue){ @ST = @{ shift @url_queue }; next if(defined $TRACK->{$ST[2]} && $TRACK->{$ST[2]} ne '?'); if($ST[3] > $C->{depth}){ $TRACK->{$ST[2]}='?' if($CONFIG->{save_skipped}>0); next; } $ST[4]=uri_get_dir($ST[2]); $REQ->{whisker}->{uri}=$ST[2]; my $result = _crawl_do_request($REQ,$RESP,$C); if($result==1 || $result==2){ push @{ $C->{errors} }, "$ST[2]: $RESP->{whisker}->{error}"; next; } $COUNT++; $TRACK->{$ST[2]}=$RESP->{whisker}->{code} if($result==0 || $result==4); $TRACK->{$ST[2]}='?' if( ($result==3||$result==5) && $CONFIG->{save_skipped}>0); if(defined $RESP->{server} && !ref($RESP->{server})){ $C->{server_tags}->{ $RESP->{server} }++; } if(defined $RESP->{'set-cookie'}){ if($CONFIG->{save_cookies}>0){ if(ref($RESP->{'set-cookie'})){ $C->{cookies}->{$_}++ foreach (@{$RESP->{'set-cookie'}}); } else { $C->{cookies}->{$RESP->{'set-cookie'}}++; } } cookie_read($C->{jar}, $RESP) if($CONFIG->{reuse_cookies}>0); } next if($result==4 || $result==5); next if(scalar @url_queue > $CONFIG->{url_limit}); if($result==0){ # page should be parsed if($CONFIG->{source_callback} != 0 && ref($CONFIG->{source_callback}) eq 'CODE'){ &{$CONFIG->{source_callback}}($C); } html_find_tags(\$RESP->{whisker}->{data}, \&_crawl_extract_links_test,0,$C,\%_crawl_linktags); $C->{parsed_page_count}++; } push @$URLS, $RESP->{location} if($result==3); foreach $T (@$URLS){ $T=~tr/\0\r\n//d; next if (length($T)==0); next if ($T=~/^#/i); # fragment push @{$C->{referrers}->{$T}}, $ST[2] if($CONFIG->{save_referrers}>0); if($T=~/^([a-zA-Z0-9]*):/ && lc($1) ne 'http' && lc($1) ne 'https'){ push @{$C->{non_http}->{$T}}, $ST[2] if($CONFIG->{save_non_http}>0); next; } if( substr($T,0,2) eq '//' && $CONFIG->{netloc_bug}>0 ){ if( $REQ->{whisker}->{ssl}>0 ){ $T='https:'.$T; } else { $T='http:'.$T; } } if($CONFIG->{callback} != 0){ next if &{$CONFIG->{callback}}($T,$C); } $T=uri_absolute( $T, $ST[4], $CONFIG->{normalize_uri} ); # (uri,protocol,host,port,params,frag,user,pass) @v=uri_split($T); # make sure URL is on same host and port if( (defined $v[2] && $v[2] ne $ST[0]) || ($v[3]>0 && $v[3] != $ST[1])){ $C->{offsites}->{uri_join(@v)}++ if($CONFIG->{save_offsites}>0); next; } next if(defined $TRACK->{$v[0]}); # we've processed this already if($v[0]=~/\.([a-z0-9]+)$/i){ if(defined $CONFIG->{skip_ext}->{lc($1)}){ $TRACK->{$v[0]}='?' if($CONFIG->{save_skipped}>0); next; } } if(defined $v[4] && $CONFIG->{use_params}>0){ $TRACK->{$v[0]}='?' if($CONFIG->{params_double_record}>0 && !defined $TRACK->{$v[0]}); $v[0]=$v[0].'?'.$v[4]; } # ST[] = [ 0.HOST, 1.PORT, 2.URL, 3.DEPTH, 4.CWD ] push @url_queue, [ $ST[0], $ST[1], $v[0], $ST[3]+1, '' ]; } # foreach @$URLS=(); # reset for next round } # while return $COUNT; } # end sub crawl ##################################################### sub _crawl_extract_links_test { my ($TAG, $hr, $dr, $start, $len, $OBJ)=(lc(shift),@_); return undef if(!scalar %$hr); # fastpath quickie # we know this is defined, due to our tagmap my $t=$_crawl_linktags{$TAG}; while( my ($key,$val)= each %$hr){ # normalize element values $$hr{lc($key)} = $val; } # all of this just to catch meta refresh URLs if($TAG eq 'meta' && defined $$hr{'http-equiv'} && $$hr{'http-equiv'} eq 'refresh' && defined $$hr{'content'} && $$hr{'content'}=~m/url=(.+)/i ){ push(@{$OBJ->{urls}},$1); } elsif(ref($t)){ foreach (@$t){ push(@{$OBJ->{urls}},$$hr{$_}) if(defined $$hr{$_}); } } else { push(@{$OBJ->{urls}},$$hr{$t}) if(defined $$hr{$t}); } if($TAG eq 'form' && defined $$hr{action}){ my $u=$OBJ->{response}->{whisker}->{uri}; $OBJ->{forms}->{ uri_absolute($$hr{action},$u,1) }++; } return undef; } ################################################################ sub _crawl_do_request_ex { my ($hrin,$hrout,$OBJ)=@_; my $ret; $ret=http_do_request($hrin,$hrout); return (2,$ret) if($ret==2); # if there was connection error, do not continue if($ret==0){ # successful request # WARNING: what if *all* HEAD respones are 302'd on purpose, but # all GETs are normal? if($$hrout{whisker}->{code} <308 && $$hrout{whisker}->{code} >300){ if($OBJ->{config}->{follow_moves} >0){ return (3,$ret) if(defined $$hrout{location} && !ref($$hrout{location})); } return (5,$ret); # not avail } if($$hrout{whisker}->{code}==200){ # no content-type is treated as text/htm if(defined $$hrout{'content-type'} && $$hrout{'content-type'}!~/^text\/htm/i){ return (4,$ret); } } } return (-1,$ret); # fallthrough } ################################################################ sub _crawl_do_request { my ($hrin,$hrout,$OBJ) = @_; my ($cret,$lwret); if($OBJ->{config}->{do_head} && $$hrin{whisker}->{method} ne 'HEAD'){ my $save=$$hrin{whisker}->{method}; $$hrin{whisker}->{method}='HEAD'; ($cret,$lwret)=_crawl_do_request_ex($hrin,$hrout,$OBJ); $$hrin{whisker}->{method}=$save; return $cret if($cret > 0); if($lwret==0){ # successful request if($$hrout{whisker}->{code}==501){ # HEAD not allowed $OBJ->{config}->{do_head}=0; # no more HEAD requests } } # request errors are essentially redone via GET, below } ($cret,$lwret)=_crawl_do_request_ex($hrin,$hrout,$OBJ); return $lwret if($cret < 0); return $cret; } } # CRAWL_CONTAINER ################################################################ ######################################################################## =item B Params: $name, \@array [, $name, \%hash, $name, \$scalar ] Return: $code [ undef on error ] The dump function will take the given $name and data reference, and will create an ASCII perl code representation suitable for eval'ing later to recreate the same structure. $name is the name of the variable that it will be saved as. Example: $output = LW2::dump('request',\%request); NOTE: dump() creates anonymous structures under the name given. For example, if you dump the hash %hin under the name 'hin', then when you eval the dumped code you will need to use %$hin, since $hin is now a *reference* to a hash. =cut sub dump { my %what=@_; my ($final,$k,$v)=(''); while( ($k,$v)=each %what){ return undef if(ref($k) || !ref($v)); $final.="\$$k = "._dump(1,$v,1); $final=~s#,\n$##; $final.=";\n"; } return $final; } ######################################################################## =item B Params: $file, $name, \@array [, $name, \%hash, $name, \@scalar ] Return: 0 if success; 1 if error This calls dump() and saves the output to the specified $file. Note: LW does not checking on the validity of the file name, it's creation, or anything of the sort. Files are opened in overwrite mode. =cut sub dump_writefile { my $file=shift; my $output=&dump(@_); return 1 if(!open(OUT,">$file") || !defined $output); print OUT $output; close(OUT); } ######################################################################## sub _dump { # dereference and dump an element my ($t, $ref, $depth)=@_; my ($out,$k,$v)=(''); $depth||=1; # to protect against circular loops return 'undef' if($depth > 128); if(!defined $ref){ return 'undef'; } elsif(ref($ref) eq 'HASH'){ $out.="{\n"; while( ($k,$v)=each %$ref){ next if($k eq ''); $out.= "\t"x$t; $out.=_dumpd($k).' => '; if(ref($v)){ $out.=_dump($t+1,$v,$depth+1); } else { $out.=_dumpd($v); } $out.=",\n" unless( substr($out,-2,2) eq ",\n"); } $out=~s#,\n$#\n#; $out.="\t"x($t-1); $out.="},\n"; } elsif(ref($ref) eq 'ARRAY'){ $out.="["; if(~~@$ref){ $out.="\n"; foreach $v (@$ref) { $out.= "\t"x$t; if(ref($v)){ $out.=_dump($t+1,$v,$depth+1); } else { $out.=_dumpd($v); } $out.=",\n" unless( substr($out,-2,2) eq ",\n"); } $out=~s#,\n$#\n#; $out.="\t"x($t-1); } $out.="],\n"; } elsif(ref($ref) eq 'SCALAR'){ $out.=_dumpd($$ref); } elsif(ref($ref) eq 'REF'){ $out.=_dump($t,$$ref,$depth+1); } elsif(ref($ref)){ # unknown/unsupported ref $out.="undef"; } else { # normal scalar $out.=_dumpd($ref); } return $out; } ######################################################################## sub _dumpd { # escape a scalar string my $v=shift; return 'undef' if(!defined $v); return "''" if($v eq ''); return "$v" if($v eq '0' || $v!~tr/0-9//c && $v!~m#^0+# ); if($v!~tr/ !-~//c){ $v=~s/(['\\])/\\$1/g; return "'$v'"; } $v=~s#\\#\\\\#g; $v=~s#"#\\"#g; $v=~s#\r#\\r#g; $v=~s#\n#\\n#g; $v=~s#\0#\\0#g; $v=~s#\t#\\t#g; $v=~s#\$#\\\$#g; $v=~s#([^!-~ ])#sprintf('\\x%02x',ord($1))#eg; return "\"$v\""; } ######################################################################## ######################################################################## =item B Params: $data [, $eol] Return: $b64_encoded_data This function does Base64 encoding. If the binary MIME::Base64 module is available, it will use that; otherwise, it falls back to an internal perl version. The perl version carries the following copyright: Copyright 1995-1999 Gisle Aas NOTE: the $eol parameter will be inserted every 76 characters. This is used to format the data for output on a 80 character wide terminal. =cut sub encode_base64 { goto &MIME::Base64::encode_base64 if($AVAILABLE{'mime::base64'}); my $res = ""; my $eol = $_[1]; $eol = "\n" unless defined $eol; pos($_[0]) = 0; while ($_[0] =~ /(.{1,45})/gs) { $res .= substr(pack('u', $1), 1); chop($res);} $res =~ tr|` -_|AA-Za-z0-9+/|; my $padding = (3 - length($_[0]) % 3) % 3; $res =~ s/.{$padding}$/'=' x $padding/e if $padding; if (length $eol) { $res =~ s/(.{1,76})/$1$eol/g; } $res; } ######################################################################## =item B Params: $data Return: $b64_decoded_data A perl implementation of base64 decoding. The perl code for this function was actually taken from an older MIME::Base64 perl module, and bears the following copyright: Copyright 1995-1999 Gisle Aas =cut sub decode_base64 { # ripped from MIME::Base64 goto &MIME::Base64::decode_base64 if($AVAILABLE{'mime::base64'}); my $str = shift; my $res = ""; $str =~ tr|A-Za-z0-9+=/||cd; $str =~ s/=+$//; # remove padding $str =~ tr|A-Za-z0-9+/| -_|; # convert to uuencoded format while ($str =~ /(.{1,60})/gs) { my $len = chr(32 + length($1)*3/4); # compute length byte $res .= unpack("u", $len . $1 ); # uudecode }$res;} ######################################################################## =item B Params: $data Return: $result This function encodes every character (except the / character) with normal URL hex encoding. =cut sub encode_uri_hex { # normal hex encoding my $str=shift; $str=~s/([^\/])/sprintf("%%%02x",ord($1))/ge; return $str;} ######################################################################### =item B Params: $data Return: $result This function randomly encodes characters (except the / character) with normal URL hex encoding. =cut sub encode_uri_randomhex { # random normal hex encoding my @T=split(//,shift); my $s; foreach (@T) { if(m#[;=:&@\?]#){ $s.=$_; next; } if((rand()*2)%2 == 1){ $s.=sprintf("%%%02x",ord($_)) ; } else{ $s.=$_; } } return $s; } ######################################################################### =item B Params: $data Return: $result This function randomly changes the case of characters in the string. =cut sub encode_uri_randomcase { my ($x,$uri)=('',shift); return $uri if($uri!~tr/a-zA-Z//); # fast-path my @T=split(//,$uri); for($x=0;$x<(scalar @T);$x++){ if((rand()*2)%2 == 1){ $T[$x]=~tr/A-Za-z/a-zA-Z/; }} return join('',@T); } ######################################################################### =item B Params: $data Return: $result This function converts a normal string into Windows unicode format (non-overlong or anything fancy). =cut sub encode_unicode { my ($c,$r)=('',''); foreach $c (split(//,shift)){ $r.=pack("v",ord($c)); } return $r; } ######################################################################### =item B Params: $unicode_string Return: $decoded_string This function attempts to decode a unicode (UTF-8) string by converting it into a single-byte-character string. Overlong characters are converted to their standard characters in place; non-overlong (aka multi-byte) characters are substituted with the 0xff; invalid encoding characters are left as-is. Note: this function is useful for dealing with the various unicode exploits/vulnerabilities found in web servers; it is *not* good for doing actual UTF-8 parsing, since characters over a single byte are basically dropped/replaced with a placeholder. =cut sub decode_unicode { my $str = $_[0]; return $str if($str!~tr/!-~//c); # fastpath my ($lead,$count,$idx); my $out=''; my $len = length($str); my ($ptr,$no,$nu)=(0,0,0); while($ptr < $len){ my $c=substr($str,$ptr,1); if( ord($c) >= 0xc0 && ord($c) <= 0xfd){ $count=0; $c=ord($c)<<1; while( ($c & 0x80) == 0x80){ $c<<=1; last if($count++ ==4); } $c = ($c & 0xff); for( $idx=1; $idx<$count; $idx++){ my $o=ord(substr($str,$ptr+$idx,1)); $no=1 if($o != 0x80); $nu=1 if($o <0x80 || $o >0xbf); } my $o=ord(substr($str,$ptr+$idx,1)); $nu=1 if( $o < 0x80 || $o > 0xbf); if($nu){ $out.=substr($str,$ptr++,1); } else { if($no){ $out.="\xff"; # generic replacement char } else { my $prior=ord(substr($str,$ptr+$count-1,1))<<6; $out.= pack("C", (ord(substr($str,$ptr+$count,1) )&0x7f)+$prior); } $ptr += $count+1; } $no=$nu=0; } else { $out.=$c; $ptr++; } } return $out; } ######################################################################## =item B Params: \%request, $modes Return: nothing encode_anti_ids computes the proper anti-ids encoding/tricks specified by $modes, and sets up %hin in order to use those tricks. Valid modes are (the mode numbers are the same as those found in whisker 1.4): =over 4 =item 1 Encode some of the characters via normal URL encoding =item 2 Insert directory self-references (/./) =item 3 Premature URL ending (make it appear the request line is done) =item 4 Prepend a long random string in the form of "/string/../URL" =item 5 Add a fake URL parameter =item 6 Use a tab instead of a space as a request spacer =item 7 Change the case of the URL (works against Windows and Novell) =item 8 Change normal seperators ('/') to Windows version ('\') =item 9 Session splicing [NOTE: not currently available] =back You can set multiple modes by setting the string to contain all the modes desired; i.e. $modes="146" will use modes 1, 4, and 6. =cut sub encode_anti_ids { my ($rhin,$modes)=(shift,shift); my (@T,$x,$c,$s,$y); my $ENCODED=0; my $W = $$rhin{'whisker'}; return if(!(defined $rhin && ref($rhin))); # in case they didn't do it already $$rhin{'whisker'}->{'uri_orig'}=$$rhin{'whisker'}->{'uri'}; # note: order is important! # mode 9 - session splicing #if($modes=~/9/){ # $$rhin{'whisker'}->{'ids_session_splice'}=1; #} # mode 4 - prepend long random string if($modes=~/4/){$s=''; if($$W{'uri'}=~m#^/#){ $y=&utils_randstr; $s.=$y while(length($s)<512); $$W{'uri'}="/$s/..".$$W{'uri'}; } } # mode 7 - (windows) random case sensitivity if($modes=~/7/){ $$W{'uri'}=encode_uri_randomcase($$W{'uri'}); } # mode 2 - directory self-reference (/./) if($modes=~/2/){ $$W{'uri'}=~s#/#/./#g; } # mode 8 - windows directory separator (\) if($modes=~/8/){ $$W{'uri'}=~s#/#\\#g; $$W{'uri'}=~s#^\\#/#; $$W{'uri'}=~s#^([a-zA-Z0-9_]+):\\#$1://#; $$W{'uri'}=~s#\\$#/#; } # mode 1 - random URI (non-UTF8) encoding if($modes=~/1/){ if($ENCODED==0){ $$W{'uri'}=encode_uri_randomhex($$W{'uri'}); $ENCODED=1;} } # mode 5 - fake parameter if($modes=~/5/){ ($s,$y)=(&utils_randstr,&utils_randstr); $$W{'uri'}="/$s.html%3F$y=/../$$W{'uri'}"; } # mode 3 - premature URL ending if($modes=~/3/){ $s=&utils_randstr; $$W{'uri'}="/%20HTTP/1.1%0d%0aAccept%3a%20$s/../..$$W{'uri'}"; } # mode 6 - TAB as request spacer if($modes=~/6/){ $$W{'http_space1'}="\t"; } } =item B The goal is to parse the variable, human-readable HTML into concrete structures useable by your program. The forms functions does do a good job at making these structures, but I will admit: they are not exactly simple, and thus not a cinch to work with. But then again, representing something as complex as a HTML form is not a simple thing either. I think the results are acceptable for what's trying to be done. Anyways... Forms are stored in perl hashes, with elements in the following format: $form{'element_name'}=@([ 'type', 'value', @params ]) Thus every element in the hash is an array of anonymous arrays. The first array value contains the element type (which is 'select', 'textarea', 'button', or an 'input' value of the form 'input-text', 'input-hidden', 'input-radio', etc). The second value is the value, if applicable (it could be undef if no value was specified). Note that select elements will always have an undef value--the actual values are in the subsequent options elements. The third value, if defined, is an anonymous array of additional tag parameters found in the element (like 'onchange="blah"', 'size="20"', 'maxlength="40"', 'selected', etc). The array does contain one special element, which is stored in the hash under a NULL character ("\0") key. This element is of the format: $form{"\0"}=['name', 'method', 'action', @parameters]; The element is an anonymous array that contains strings of the form's name, method, and action (values can be undef), and a @parameters array similar to that found in normal elements (above). Accessing individual values stored in the form hash becomes a test of your perl referencing skills. Hint: to access the 'value' of the third element named 'choices', you would need to do: $form{'choices'}->[2]->[1]; The '[2]' is the third element (normal array starts with 0), and the actual value is '[1]' (the type is '[0]', and the parameter array is '[2]'). =cut ################################################################ # Cluster global variables %_forms_ELEMENTS=( 'form'=>1, 'input'=>1, 'textarea'=>1, 'button'=>1, 'select'=>1, 'option'=>1, '/select'=>1 ); ################################################################ =item B Params: \$html_data Return: @found_forms This function parses the given $html_data into libwhisker form hashes. It returns an array of hash references to the found forms. =cut sub forms_read { my $dr=shift; return undef if(!ref($dr) || length($$dr)==0); my $A = [ {}, [] ]; html_find_tags($dr,\&_forms_parse_callback,0, $A ,\%_forms_ELEMENTS); if(scalar %{$A->[0]}){ push(@{$A->[1]},$A->[0]); } return $A->[1]; } ################################################################ =item B Params: \%form_hash Return: $html_of_form [undef on error] This function will take the given %form hash and compose a generic HTML representation of it, formatted with tabs and newlines in order to make it neat and tidy for printing. Note: this function does *not* escape any special characters that were embedded in the element values. =cut sub forms_write { my $hr=shift; return undef if(!ref($hr) || !(scalar %$hr)); return undef if(!defined $$hr{"\0"}); my $t='
[0].'" method="'; $t.=$$hr{"\0"}->[1].'" action="'.$$hr{"\0"}->[2].'"'; if(defined $$hr{"\0"}->[3]){ $t.=' '.join(' ',@{$$hr{"\0"}->[3]}); } $t.=">\n"; while( my($name,$ar)=each(%$hr) ){ next if($name eq "\0"); foreach $a (@$ar){ my $P=''; $P=' '.join(' ', @{$$a[2]}) if(defined $$a[2]); $t.="\t"; if($$a[0] eq 'textarea'){ $t.="\n"; } elsif($$a[0]=~m/^input-(.+)$/){ $t.="\n"; } elsif($$a[0] eq 'option'){ $t.="\t