root/feedmelinks/lib/LWPx/ParanoidAgent.pm

Revision 1448, 19.0 kB (checked in by jm3, 2 years ago)

new spammer blacklist, perl libs, and better splinker hunting tools

Line 
1 package LWPx::ParanoidAgent;
2 require LWP::UserAgent;
3
4 use vars qw(@ISA $VERSION);
5 @ISA = qw(LWP::UserAgent);
6 $VERSION = '1.03';
7
8 require HTTP::Request;
9 require HTTP::Response;
10
11 use HTTP::Status ();
12 use strict;
13 use Net::DNS;
14
15 sub new {
16     my $class = shift;
17     my %opts = @_;
18
19     my $blocked_hosts     = delete $opts{blocked_hosts}     || [];
20     my $whitelisted_hosts = delete $opts{whitelisted_hosts} || [];
21     my $resolver          = delete $opts{resolver};
22     my $paranoid_proxy    = delete $opts{paranoid_proxy};
23     $opts{timeout}      ||= 15;
24
25     my $self = LWP::UserAgent->new( %opts );
26
27     $self->{'blocked_hosts'}     = $blocked_hosts;
28     $self->{'whitelisted_hosts'} = $whitelisted_hosts;
29     $self->{'resolver'}          = $resolver;
30     $self->{'paranoid_proxy'}    = $paranoid_proxy;
31
32     $self = bless $self, $class;
33     return $self;
34 }
35
36 # returns seconds remaining given a request
37 sub _time_remain {
38     my $self = shift;
39     my $req = shift;
40
41     my $now = time();
42     my $start_time = $req->{_time_begin} || $now;
43     return $start_time + $self->{timeout} - $now;
44 }
45
46 sub _resolve {
47     my ($self, $host, $request, $timeout, $depth) = @_;
48     my $res = $self->resolver;
49     $depth ||= 0;
50
51     die "CNAME recursion depth limit exceeded.\n" if $depth > 10;
52     die "Suspicious results from DNS lookup" if $self->_bad_host($host);
53
54     # return the IP address if it looks like one and wasn't marked bad
55     return ($host) if $host =~ /^\d+\.\d+\.\d+\.\d+$/;
56
57     my $sock = $res->bgsend($host)
58         or die "No sock from bgsend";
59
60     my $rin = '';
61     vec($rin, fileno($sock), 1) = 1;
62     my $nf = select($rin, undef, undef, $self->_time_remain($request));
63     die "DNS lookup timeout" unless $nf;
64
65     my $packet = $res->bgread($sock)
66         or die "DNS bgread failure";
67     $sock = undef;
68
69     my @addr;
70     my $cname;
71     foreach my $rr ($packet->answer) {
72         if ($rr->type eq "A") {
73             die "Suspicious DNS results from A record\n" if $self->_bad_host($rr->address);
74             # untaints the address:
75             push @addr, join(".", ($rr->address =~ /^(\d+)\.(\d+)\.(\d+)\.(\d+)$/));
76         } elsif ($rr->type eq "CNAME") {
77             # will be checked for validity in the recursion path
78             $cname = $rr->cname;
79         }
80     }
81
82     return @addr if @addr;
83     return () unless $cname;
84     return $self->_resolve($cname, $request, $timeout, $depth + 1);
85 }
86
87 sub _host_list_match {
88     my $self = shift;
89     my $list_name = shift;
90     my $host = shift;
91
92     foreach my $rule (@{ $self->{$list_name} }) {
93         if (ref $rule eq "CODE") {
94             return 1 if $rule->($host);
95         } elsif (ref $rule) {
96             # assume regexp
97             return 1 if $host =~ /$rule/;
98         } else {
99             return 1 if $host eq $rule;
100         }
101     }
102 }
103
104 sub _bad_host {
105     my $self = shift;
106     my $host = lc(shift);
107
108     return 0 if $self->_host_list_match("whitelisted_hosts", $host);
109     return 1 if $self->_host_list_match("blocked_hosts", $host);
110     return 1 if
111         $host =~ /^localhost$/i ||    # localhost is bad.  even though it'd be stopped in
112                                       #    a later call to _bad_host with the IP address
113         $host =~ /\s/i;               # any whitespace is questionable
114
115     # Let's assume it's an IP address now, and get it into 32 bits.
116     # Uf at any time something doesn't look like a number, then it's
117     # probably a hostname and we've already either whitelisted or
118     # blacklisted those, so we'll just say it's okay and it'll come
119     # back here later when the resolver finds an IP address.
120     my @parts = split(/\./, $host);
121     return 0 if @parts > 4;
122
123     # un-octal/un-hex the parts, or return if there's a non-numeric part
124     my $overflow_flag = 0;
125     foreach (@parts) {
126         return 0 unless /^\d+$/ || /^0x[a-f\d]+$/;
127         local $SIG{__WARN__} = sub { $overflow_flag = 1; };
128         $_ = oct($_) if /^0/;
129     }
130
131     # a purely numeric address shouldn't overflow.
132     return 1 if $overflow_flag;
133
134     my $addr;  # network order packed IP address
135
136     if (@parts == 1) {
137         # a - 32 bits
138         return 1 if
139             $parts[0] > 0xffffffff;
140         $addr = pack("N", $parts[0]);
141     } elsif (@parts == 2) {
142         # a.b - 8.24 bits
143         return 1 if
144             $parts[0] > 0xff ||
145             $parts[1] > 0xffffff;
146         $addr = pack("N", $parts[0] << 24 | $parts[1]);
147     } elsif (@parts == 3) {
148         # a.b.c - 8.8.16 bits
149         return 1 if
150             $parts[0] > 0xff ||
151             $parts[1] > 0xff ||
152             $parts[2] > 0xffff;
153         $addr = pack("N", $parts[0] << 24 | $parts[1] << 16 | $parts[2]);
154     } else {
155         # a.b.c.d - 8.8.8.8 bits
156         return 1 if
157             $parts[0] > 0xff ||
158             $parts[1] > 0xff ||
159             $parts[2] > 0xff ||
160             $parts[3] > 0xff;
161         $addr = pack("N", $parts[0] << 24 | $parts[1] << 16 | $parts[2] << 8 | $parts[3]);
162     }
163
164     my $haddr = unpack("N", $addr); # host order IP address
165     return 1 if
166         ($haddr & 0xFF000000) == 0x00000000 || # 0.0.0.0/8
167         ($haddr & 0xFF000000) == 0x0A000000 || # 10.0.0.0/8
168         ($haddr & 0xFF000000) == 0x7F000000 || # 127.0.0.0/8
169         ($haddr & 0xFFF00000) == 0xAC100000 || # 172.16.0.0/12
170         ($haddr & 0xFFFF0000) == 0xA9FE0000 || # 169.254.0.0/16
171         ($haddr & 0xFFFF0000) == 0xC0A80000 || # 192.168.0.0/16
172         ($haddr & 0xFFFFFF00) == 0xC0000200 || # 192.0.2.0/24  "TEST-NET" docs/example code
173         ($haddr & 0xFFFFFF00) == 0xC0586300 || # 192.88.99.0/24 6to4 relay anycast addresses
174          $haddr               == 0xFFFFFFFF || # 255.255.255.255
175         ($haddr & 0xF0000000) == 0xE0000000;  # multicast addresses
176
177     # as final IP address check, pass in the canonical a.b.c.d decimal form
178     # to the blacklisted host check to see if matches as bad there.
179     my $can_ip = join(".", map { ord } split //, $addr);
180     return 1 if $self->_host_list_match("blocked_hosts", $can_ip);
181
182     # looks like an okay IP address
183     return 0;
184 }
185
186 sub request {
187     my ($self, $req, $arg, $size, $previous) = @_;
188
189     # walk back to the first request, and set our _time_begin to its _time_begin, or if
190     # we're the first, then use current time.  used by LWPx::Protocol::http_paranoid
191     my $first_res = $previous;  # previous is the previous response that invoked this request
192     $first_res = $first_res->previous while $first_res && $first_res->previous;
193     $req->{_time_begin} = $first_res ? $first_res->request->{_time_begin} : time();
194
195     my $host = $req->uri->host;
196     if ($self->_bad_host($host)) {
197         my $err_res = HTTP::Response->new(403, "Unauthorized access to blocked host");
198         $err_res->request($req);
199         $err_res->header("Client-Date" => HTTP::Date::time2str(time));
200         $err_res->header("Client-Warning" => "Internal response");
201         $err_res->header("Content-Type" => "text/plain");
202         $err_res->content("403 Unauthorized access to blocked host\n");
203         return $err_res;
204     }
205
206     if (my $pp = $self->{paranoid_proxy}) {
207         $req->uri("$pp?url="   . eurl($req->uri) .
208                   "&timeout="  . ($self->{timeout}  + 0) .
209                   "&max_size=" . ($self->{max_size} + 0));
210     }
211
212     return $self->SUPER::request($req, $arg, $size, $previous);
213 }
214
215 # taken from LWP::UserAgent and modified slightly.  (proxy support removed,
216 # and map http and https schemes to separate protocol handlers)
217 sub send_request
218 {
219     my ($self, $request, $arg, $size) = @_;
220     $self->_request_sanity_check($request);
221
222     my ($method, $url) = ($request->method, $request->uri);
223
224     local($SIG{__DIE__});  # protect against user defined die handlers
225
226     # Check that we have a METHOD and a URL first
227     return _new_response($request, &HTTP::Status::RC_BAD_REQUEST, "Method missing")
228         unless $method;
229     return _new_response($request, &HTTP::Status::RC_BAD_REQUEST, "URL missing")
230         unless $url;
231     return _new_response($request, &HTTP::Status::RC_BAD_REQUEST, "URL must be absolute")
232         unless $url->scheme;
233     return _new_response($request, &HTTP::Status::RC_BAD_REQUEST,
234                          "ParanoidAgent doesn't support going through proxies.  ".
235                          "In that case, do your paranoia at your proxy instead.")
236         if $self->_need_proxy($url);
237
238     my $scheme = $url->scheme;
239     return _new_response($request, &HTTP::Status::RC_BAD_REQUEST, "Only http and https are supported by ParanoidAgent")
240         unless $scheme eq "http" || $scheme eq "https";
241
242     LWP::Debug::trace("$method $url");
243
244     my $protocol;
245
246     {
247       # Honor object-specific restrictions by forcing protocol objects
248       #  into class LWP::Protocol::nogo.
249         my $x;
250         if($x       = $self->protocols_allowed) {
251             if(grep lc($_) eq $scheme, @$x) {
252                 LWP::Debug::trace("$scheme URLs are among $self\'s allowed protocols (@$x)");
253               }
254             else {
255                 LWP::Debug::trace("$scheme URLs aren't among $self\'s allowed protocols (@$x)");
256                   require LWP::Protocol::nogo;
257                   $protocol = LWP::Protocol::nogo->new;
258               }
259         }
260         elsif ($x = $self->protocols_forbidden) {
261             if(grep lc($_) eq $scheme, @$x) {
262                 LWP::Debug::trace("$scheme URLs are among $self\'s forbidden protocols (@$x)");
263                   require LWP::Protocol::nogo;
264                   $protocol = LWP::Protocol::nogo->new;
265               }
266             else {
267                 LWP::Debug::trace("$scheme URLs aren't among $self\'s forbidden protocols (@$x)");
268               }
269         }
270       # else fall thru and create the protocol object normally
271     }
272
273     unless ($protocol) {
274         LWP::Protocol::implementor("${scheme}_paranoid",  "LWPx::Protocol::${scheme}_paranoid");
275         eval "require LWPx::Protocol::${scheme}_paranoid;";
276         if ($@) {
277             $@ =~ s/ at .* line \d+.*//s;  # remove file/line number
278             my $response =  _new_response($request, &HTTP::Status::RC_NOT_IMPLEMENTED, $@);
279             return $response;
280         }
281
282         $protocol = eval { LWP::Protocol::create($scheme eq "http" ? "http_paranoid" : "https_paranoid", $self) };
283         if ($@) {
284             $@ =~ s/ at .* line \d+.*//s;  # remove file/line number
285             my $response =  _new_response($request, &HTTP::Status::RC_NOT_IMPLEMENTED, $@);
286             if ($scheme eq "https") {
287                 $response->message($response->message . " (Crypt::SSLeay not installed)");
288                 $response->content_type("text/plain");
289                 $response->content(<<EOT);
290 LWP will support https URLs if the Crypt::SSLeay module is installed.
291 More information at <http://www.linpro.no/lwp/libwww-perl/README.SSL>.
292 EOT
293 }
294             return $response;
295         }
296     }
297
298     # Extract fields that will be used below
299     my ($timeout, $cookie_jar, $use_eval, $parse_head, $max_size) =
300         @{$self}{qw(timeout cookie_jar use_eval parse_head max_size)};
301
302     my $response;
303     my $proxy = undef;
304     if ($use_eval) {
305         # we eval, and turn dies into responses below
306         eval {
307             $response = $protocol->request($request, $proxy,
308                                            $arg, $size, $timeout);
309         };
310         if ($@) {
311             $@ =~ s/ at .* line \d+.*//s;  # remove file/line number
312             $response = _new_response($request,
313                                       &HTTP::Status::RC_INTERNAL_SERVER_ERROR,
314                                       $@);
315         }
316     }
317     else {
318         $response = $protocol->request($request, $proxy,
319                                        $arg, $size, $timeout);
320         # XXX: Should we die unless $response->is_success ???
321     }
322
323     $response->request($request);  # record request for reference
324     $cookie_jar->extract_cookies($response) if $cookie_jar;
325     $response->header("Client-Date" => HTTP::Date::time2str(time));
326     return $response;
327 }
328
329 # blocked hostnames, compiled patterns, or subrefs
330 sub blocked_hosts
331 {
332     my $self = shift;
333     if (@_) {
334         my @hosts = @_;
335         $self->{'blocked_hosts'} = \@hosts;
336         return;
337     }
338     return @{ $self->{'blocked_hosts'} };
339 }
340
341 # whitelisted hostnames, compiled patterns, or subrefs
342 sub whitelisted_hosts
343 {
344     my $self = shift;
345     if (@_) {
346         my @hosts = @_;
347         $self->{'whitelisted_hosts'} = \@hosts;
348         return;
349     }
350     return @{ $self->{'whitelisted_hosts'} };
351 }
352
353 # get/set Net::DNS resolver object
354 sub resolver
355 {
356     my $self = shift;
357     if (@_) {
358         $self->{'resolver'} = shift;
359         require UNIVERSAL ;
360         die "Not a Net::DNS::Resolver object" unless
361             UNIVERSAL::isa($self->{'resolver'}, "Net::DNS::Resolver");
362     }
363     return $self->{'resolver'} ||= Net::DNS::Resolver->new;
364 }
365
366 # Taken directly from LWP::UserAgent because it was private there, and we can't depend on it
367 # staying there in future versions:  needed by our modified version of send_request
368 sub _need_proxy
369 {
370     my($self, $url) = @_;
371     $url = $HTTP::URI_CLASS->new($url) unless ref $url;
372
373     my $scheme = $url->scheme || return;
374     if (my $proxy = $self->{'proxy'}{$scheme}) {
375         if (@{ $self->{'no_proxy'} }) {
376             if (my $host = eval { $url->host }) {
377                 for my $domain (@{ $self->{'no_proxy'} }) {
378                     if ($host =~ /\Q$domain\E$/) {
379                         LWP::Debug::trace("no_proxy configured");
380                           return;
381                       }
382                 }
383             }
384         }
385         LWP::Debug::debug("Proxied to $proxy");
386         return $HTTP::URI_CLASS->new($proxy);
387     }
388     LWP::Debug::debug('Not proxied');
389     undef;
390 }
391
392 # Taken directly from LWP::UserAgent because it was private there, and we can't depend on it
393 # staying there in future versions:  needed by our modified version of send_request
394 sub _request_sanity_check {
395     my($self, $request) = @_;
396     # some sanity checking
397     if (defined $request) {
398         if (ref $request) {
399             Carp::croak("You need a request object, not a " . ref($request) . " object")
400               if ref($request) eq 'ARRAY' or ref($request) eq 'HASH' or
401               !$request->can('method') or !$request->can('uri');
402           }
403         else {
404             Carp::croak("You need a request object, not '$request'");
405           }
406     }
407     else {
408         Carp::croak("No request object passed in");
409       }
410 }
411
412 # Taken directly from LWP::UserAgent because it was private there, and we can't depend on it
413 # staying there in future versions:  needed by our modified version of send_request
414 sub _new_response {
415     my($request, $code, $message) = @_;
416     my $response = HTTP::Response->new($code, $message);
417     $response->request($request);
418     $response->header("Client-Date" => HTTP::Date::time2str(time));
419     $response->header("Client-Warning" => "Internal response");
420     $response->header("Content-Type" => "text/plain");
421     $response->content("$code $message\n");
422     return $response;
423 }
424
425 sub eurl {
426     my $a = $_[0];
427     $a =~ s/([^a-zA-Z0-9_\,\-.\/\\\: ])/uc sprintf("%%%02x",ord($1))/eg;
428     $a =~ tr/ /+/;
429     return $a;
430 }
431
432 1;
433
434 __END__
435
436 =head1 NAME
437
438 LWPx::ParanoidAgent - subclass of LWP::UserAgent that protects you from harm
439
440 =head1 SYNOPSIS
441
442  require LWPx::ParanoidAgent;
443
444  my $ua = LWPx::ParanoidAgent->new;
445
446  # this is 10 seconds overall, from start to finish.  not just between
447  # socket reads.  and it includes all redirects.  so attackers telling
448  # you to download from a malicious tarpit webserver can only stall
449  # you for $n seconds
450
451  $ua->timeout(10);
452
453  # setup extra block lists, in addition to the always-enforced blocking
454  # of private IP addresses, loopbacks, and multicast addresses
455
456  $ua->blocked_hosts(
457     "foo.com",
458     qr/\.internal\.company\.com$/i,
459     sub { my $host = shift;  return 1 if is_bad($host); },
460  );
461
462  $ua->whitelisted_hosts(
463     "brad.lj",
464     qr/^192\.168\.64\.3?/,
465     sub { ... },
466  );
467
468  # get/set the DNS resolver object that's used
469  my $resolver = $ua->resolver;
470  $ua->resolver(Net::DNS::Resolver->new(...));
471
472  # and then just like a normal LWP::UserAgent, because it is one.
473  my $response = $ua->get('http://search.cpan.org/');
474  ...
475  if ($response->is_success) {
476      print $response->content;  # or whatever
477  }
478  else {
479      die $response->status_line;
480  }
481
482 =head1 DESCRIPTION
483
484 The C<LWPx::ParanoidAgent> is a class subclassing C<LWP::UserAgent>,
485 but paranoid against attackers.  It's to be used when you're fetching
486 a remote resource on behalf of a possibly malicious user.
487
488 This class can do whatever C<LWP::UserAgent> can (callbacks, uploads from
489 files, etc), except proxy support is explicitly removed, because in
490 that case you should do your paranoia at your proxy.
491
492 Also, the schemes are limited to http and https, which are mapped to
493 C<LWPx::Protocol::http_paranoid> and
494 C<LWPx::Protocol::https_paranoid>, respectively, which are forked
495 versions of the same ones without the "_paranoid".  Subclassing them
496 didn't look possible, as they were essentially just one huge function.
497
498 This class protects you from connecting to internal IP ranges (unless you
499 whitelist th