| 1 |
|
|---|
| 2 |
|
|---|
| 3 |
package URI::Fetch; |
|---|
| 4 |
use strict; |
|---|
| 5 |
use base qw( Class::ErrorHandler ); |
|---|
| 6 |
|
|---|
| 7 |
use LWP::UserAgent; |
|---|
| 8 |
use Carp qw( croak ); |
|---|
| 9 |
use URI; |
|---|
| 10 |
use URI::Fetch::Response; |
|---|
| 11 |
|
|---|
| 12 |
our $VERSION = '0.08'; |
|---|
| 13 |
|
|---|
| 14 |
our $HAS_ZLIB; |
|---|
| 15 |
BEGIN { |
|---|
| 16 |
$HAS_ZLIB = eval "use Compress::Zlib (); 1;"; |
|---|
| 17 |
} |
|---|
| 18 |
|
|---|
| 19 |
use constant URI_OK => 200; |
|---|
| 20 |
use constant URI_MOVED_PERMANENTLY => 301; |
|---|
| 21 |
use constant URI_NOT_MODIFIED => 304; |
|---|
| 22 |
use constant URI_GONE => 410; |
|---|
| 23 |
|
|---|
| 24 |
sub fetch { |
|---|
| 25 |
my $class = shift; |
|---|
| 26 |
my($uri, %param) = @_; |
|---|
| 27 |
|
|---|
| 28 |
|
|---|
| 29 |
my $cache = delete $param{Cache}; |
|---|
| 30 |
my $ua = delete $param{UserAgent}; |
|---|
| 31 |
my $p_etag = delete $param{ETag}; |
|---|
| 32 |
my $p_lastmod = delete $param{LastModified}; |
|---|
| 33 |
my $content_hook = delete $param{ContentAlterHook}; |
|---|
| 34 |
my $p_no_net = delete $param{NoNetwork}; |
|---|
| 35 |
my $p_cache_grep = delete $param{CacheEntryGrep}; |
|---|
| 36 |
my $freeze = delete $param{Freeze}; |
|---|
| 37 |
my $thaw = delete $param{Thaw}; |
|---|
| 38 |
my $force = delete $param{ForceResponse}; |
|---|
| 39 |
croak("Unknown parameters: " . join(", ", keys %param)) |
|---|
| 40 |
if %param; |
|---|
| 41 |
|
|---|
| 42 |
my $ref; |
|---|
| 43 |
if ($cache) { |
|---|
| 44 |
unless ($freeze && $thaw) { |
|---|
| 45 |
require Storable; |
|---|
| 46 |
$thaw = \&Storable::thaw; |
|---|
| 47 |
$freeze = \&Storable::freeze; |
|---|
| 48 |
} |
|---|
| 49 |
if (my $blob = $cache->get($uri)) { |
|---|
| 50 |
$ref = $thaw->($blob); |
|---|
| 51 |
} |
|---|
| 52 |
} |
|---|
| 53 |
|
|---|
| 54 |
|
|---|
| 55 |
if ($p_no_net) { |
|---|
| 56 |
croak("Invalid NoNetworkValue (negative)") if $p_no_net < 0; |
|---|
| 57 |
if ($ref && ($p_no_net == 1 || $ref->{CacheTime} > time() - $p_no_net)) { |
|---|
| 58 |
|
|---|
| 59 |
my $fetch = URI::Fetch::Response->new; |
|---|
| 60 |
$fetch->status(URI_OK); |
|---|
| 61 |
$fetch->content($ref->{Content}); |
|---|
| 62 |
$fetch->etag($ref->{ETag}); |
|---|
| 63 |
$fetch->last_modified($ref->{LastModified}); |
|---|
| 64 |
$fetch->content_type($ref->{ContentType}); |
|---|
| 65 |
return $fetch; |
|---|
| 66 |
} |
|---|
| 67 |
return undef if $p_no_net == 1; |
|---|
| 68 |
} |
|---|
| 69 |
|
|---|
| 70 |
$ua ||= LWP::UserAgent->new; |
|---|
| 71 |
$ua->agent(join '/', $class, $class->VERSION) |
|---|
| 72 |
if $ua->agent =~ /^libwww-perl/; |
|---|
| 73 |
|
|---|
| 74 |
my $req = HTTP::Request->new(GET => $uri); |
|---|
| 75 |
if ($HAS_ZLIB) { |
|---|
| 76 |
$req->header('Accept-Encoding', 'gzip'); |
|---|
| 77 |
} |
|---|
| 78 |
if (my $etag = ($p_etag || $ref->{ETag})) { |
|---|
| 79 |
$req->header('If-None-Match', $etag); |
|---|
| 80 |
} |
|---|
| 81 |
if (my $ts = ($p_lastmod || $ref->{LastModified})) { |
|---|
| 82 |
$req->if_modified_since($ts); |
|---|
| 83 |
} |
|---|
| 84 |
|
|---|
| 85 |
my $res = $ua->request($req); |
|---|
| 86 |
my $fetch = URI::Fetch::Response->new; |
|---|
| 87 |
$fetch->uri($uri); |
|---|
| 88 |
$fetch->http_status($res->code); |
|---|
| 89 |
$fetch->http_response($res); |
|---|
| 90 |
$fetch->content_type($res->header('Content-Type')); |
|---|
| 91 |
if ($res->previous && $res->previous->code == HTTP::Status::RC_MOVED_PERMANENTLY()) { |
|---|
| 92 |
$fetch->status(URI_MOVED_PERMANENTLY); |
|---|
| 93 |
$fetch->uri($res->previous->header('Location')); |
|---|
| 94 |
} elsif ($res->code == HTTP::Status::RC_GONE()) { |
|---|
| 95 |
$fetch->status(URI_GONE); |
|---|
| 96 |
$fetch->uri(undef); |
|---|
| 97 |
return $fetch; |
|---|
| 98 |
} elsif ($res->code == HTTP::Status::RC_NOT_MODIFIED()) { |
|---|
| 99 |
$fetch->status(URI_NOT_MODIFIED); |
|---|
| 100 |
$fetch->content($ref->{Content}); |
|---|
| 101 |
$fetch->etag($ref->{ETag}); |
|---|
| 102 |
$fetch->last_modified($ref->{LastModified}); |
|---|
| 103 |
$fetch->content_type($ref->{ContentType}); |
|---|
| 104 |
return $fetch; |
|---|
| 105 |
} elsif (!$res->is_success) { |
|---|
| 106 |
return $force ? $fetch : $class->error($res->message); |
|---|
| 107 |
|
|---|
| 108 |
} else { |
|---|
| 109 |
$fetch->status(URI_OK); |
|---|
| 110 |
} |
|---|
| 111 |
$fetch->last_modified($res->last_modified); |
|---|
| 112 |
$fetch->etag($res->header('ETag')); |
|---|
| 113 |
my $content = $res->content; |
|---|
| 114 |
if ($res->content_encoding && $res->content_encoding eq 'gzip') { |
|---|
| 115 |
$content = Compress::Zlib::memGunzip($content); |
|---|
| 116 |
} |
|---|
| 117 |
|
|---|
| 118 |
|
|---|
| 119 |
|
|---|
| 120 |
|
|---|
| 121 |
|
|---|
| 122 |
if ($content_hook) { |
|---|
| 123 |
croak("ContentAlterHook is not a subref") unless ref $content_hook eq "CODE"; |
|---|
| 124 |
$content_hook->(\$content); |
|---|
| 125 |
} |
|---|
| 126 |
|
|---|
| 127 |
$fetch->content($content); |
|---|
| 128 |
|
|---|
| 129 |
|
|---|
| 130 |
|
|---|
| 131 |
if ($cache && |
|---|
| 132 |
($p_cache_grep ? $p_cache_grep->($fetch) : 1)) { |
|---|
| 133 |
|
|---|
| 134 |
$cache->set($fetch->uri, $freeze->({ |
|---|
| 135 |
ETag => $fetch->etag, |
|---|
| 136 |
LastModified => $fetch->last_modified, |
|---|
| 137 |
Content => $fetch->content, |
|---|
| 138 |
CacheTime => time(), |
|---|
| 139 |
ContentType => $fetch->content_type, |
|---|
| 140 |
})); |
|---|
| 141 |
} |
|---|
| 142 |
$fetch; |
|---|
| 143 |
} |
|---|
| 144 |
|
|---|
| 145 |
1; |
|---|
| 146 |
__END__ |
|---|
| 147 |
|
|---|
| 148 |
=head1 NAME |
|---|
| 149 |
|
|---|
| 150 |
URI::Fetch - Smart URI fetching/caching |
|---|
| 151 |
|
|---|
| 152 |
=head1 SYNOPSIS |
|---|
| 153 |
|
|---|
| 154 |
use URI::Fetch; |
|---|
| 155 |
|
|---|
| 156 |
## Simple fetch. |
|---|
| 157 |
my $res = URI::Fetch->fetch('http://example.com/atom.xml') |
|---|
| 158 |
or die URI::Fetch->errstr; |
|---|
| 159 |
|
|---|
| 160 |
## Fetch using specified ETag and Last-Modified headers. |
|---|
| 161 |
my $res = URI::Fetch->fetch('http://example.com/atom.xml', |
|---|
| 162 |
ETag => '123-ABC', |
|---|
| 163 |
LastModified => time - 3600, |
|---|
| 164 |
) |
|---|
| 165 |
or die URI::Fetch->errstr; |
|---|
| 166 |
|
|---|
| 167 |
## Fetch using an on-disk cache that URI::Fetch manages for you. |
|---|
| 168 |
my $cache = Cache::File->new( cache_root => '/tmp/cache' ); |
|---|
| 169 |
my $res = URI::Fetch->fetch('http://example.com/atom.xml', |
|---|
| 170 |
Cache => $cache |
|---|
| 171 |
) |
|---|
| 172 |
or die URI::Fetch->errstr; |
|---|
| 173 |
|
|---|
| 174 |
=head1 DESCRIPTION |
|---|
| 175 |
|
|---|
| 176 |
I<URI::Fetch> is a smart client for fetching HTTP pages, notably |
|---|
| 177 |
syndication feeds (RSS, Atom, and others), in an intelligent, |
|---|
| 178 |
bandwidth- and time-saving way. That means: |
|---|
| 179 |
|
|---|
| 180 |
=over 4 |
|---|
| 181 |
|
|---|
| 182 |
=item * GZIP support |
|---|
| 183 |
|
|---|
| 184 |
If you have I<Compress::Zlib> installed, I<URI::Fetch> will automatically |
|---|
| 185 |
try to download a compressed version of the content, saving bandwidth (and |
|---|
| 186 |
time). |
|---|
| 187 |
|
|---|
| 188 |
=item * I<Last-Modified> and I<ETag> support |
|---|
| 189 |
|
|---|
| 190 |
If you use a local cache (see the I<Cache> parameter to I<fetch>), |
|---|
| 191 |
I<URI::Fetch> will keep track of the I<Last-Modified> and I<ETag> headers |
|---|
| 192 |
from the server, allowing you to only download pages that have been |
|---|
| 193 |
modified since the last time you checked. |
|---|
| 194 |
|
|---|
| 195 |
=item * Proper understanding of HTTP error codes |
|---|
| 196 |
|
|---|
| 197 |
Certain HTTP error codes are special, particularly when fetching syndication |
|---|
| 198 |
feeds, and well-written clients should pay special attention to them. |
|---|
| 199 |
I<URI::Fetch> can only do so much for you in this regard, but it gives |
|---|
| 200 |
you the tools to be a well-written client. |
|---|
| 201 |
|
|---|
| 202 |
The response from I<fetch> gives you the raw HTTP response code, along with |
|---|
| 203 |
special handling of 4 codes: |
|---|
| 204 |
|
|---|
| 205 |
=over 4 |
|---|
| 206 |
|
|---|
| 207 |
=item * 200 (OK) |
|---|
| 208 |
|
|---|
| 209 |
Signals that the content of a page/feed was retrieved |
|---|
| 210 |
successfully. |
|---|
| 211 |
|
|---|
| 212 |
=item * 301 (Moved Permanently) |
|---|
| 213 |
|
|---|
| 214 |
Signals that a page/feed has moved permanently, and that |
|---|
| 215 |
your database of feeds should be updated to reflect the new |
|---|
| 216 |
URI. |
|---|
| 217 |
|
|---|
| 218 |
=item * 304 (Not Modified) |
|---|
| 219 |
|
|---|
| 220 |
Signals that a page/feed has not changed since it was last |
|---|
| 221 |
fetched. |
|---|
| 222 |
|
|---|
| 223 |
=item * 410 (Gone) |
|---|
| 224 |
|
|---|
| 225 |
Signals that a page/feed is gone and will never be coming back, |
|---|
| 226 |
so you should stop trying to fetch it. |
|---|
| 227 |
|
|---|
| 228 |
=back |
|---|
| 229 |
|
|---|
| 230 |
=head1 USAGE |
|---|
| 231 |
|
|---|
| 232 |
=head2 URI::Fetch->fetch($uri, %param) |
|---|
| 233 |
|
|---|
| 234 |
Fetches a page identified by the URI I<$uri>. |
|---|
| 235 |
|
|---|
| 236 |
On success, returns a I<URI::Fetch::Response> object; on failure, returns |
|---|
| 237 |
C<undef>. |
|---|
| 238 |
|
|---|
| 239 |
I<%param> can contain: |
|---|
| 240 |
|
|---|
| 241 |
=over 4 |
|---|
| 242 |
|
|---|
| 243 |
=item * LastModified |
|---|
| 244 |
|
|---|
| 245 |
=item * ETag |
|---|
| 246 |
|
|---|
| 247 |
I<LastModified> and I<ETag> can be supplied to force the server to only |
|---|
| 248 |
return the full page if it's changed since the last request. If you're |
|---|
| 249 |
writing your own feed client, this is recommended practice, because it |
|---|
| 250 |
limits both your bandwidth use and the server's. |
|---|
| 251 |
|
|---|
| 252 |
If you'd rather not have to store the I<LastModified> time and I<ETag> |
|---|
| 253 |
yourself, see the I<Cache> parameter below (and the L<SYNOPSIS> above). |
|---|
| 254 |
|
|---|
| 255 |
=item * Cache |
|---|
| 256 |
|
|---|
| 257 |
If you'd like I<URI::Fetch> to cache responses between requests, provide |
|---|
| 258 |
the I<Cache> parameter with an object supporting the L<Cache> API (e.g. |
|---|
| 259 |
I<Cache::File>, I<Cache::Memory>). Specifically, an object that supports |
|---|
| 260 |
C<$cache-E<gt>get($key)> and C<$cache-E<gt>set($key, $value, $expires)>. |
|---|
| 261 |
|
|---|
| 262 |
If supplied, I<URI::Fetch> will store the page content, ETag, and |
|---|
| 263 |
last-modified time of the response in the cache, and will pull the |
|---|
| 264 |
content from the cache on subsequent requests if the page returns a |
|---|
| 265 |
Not-Modified response. |
|---|
| 266 |
|
|---|
| 267 |
=item * UserAgent |
|---|
| 268 |
|
|---|
| 269 |
Optional. You may provide your own LWP::UserAgent instance. Look |
|---|
| 270 |
into L<LWPx::ParanoidUserAgent> if you're fetching URLs given to you |
|---|
| 271 |
by possibly malicious parties. |
|---|
| 272 |
|
|---|
| 273 |
=item * NoNetwork |
|---|
| 274 |
|
|---|
| 275 |
Optional. Controls the interaction between the cache and HTTP |
|---|
| 276 |
requests with If-Modified-Since/If-None-Match headers. Possible |
|---|
| 277 |
behaviors are: |
|---|
| 278 |
|
|---|
| 279 |
=over |
|---|
| 280 |
|
|---|
| 281 |
=item false (default) |
|---|
| 282 |
|
|---|
| 283 |
If a page is in the cache, the origin HTTP server is always checked |
|---|
| 284 |
for a fresher copy with an If-Modified-Since and/or If-None-Match |
|---|
| 285 |
header. |
|---|
| 286 |
|
|---|
| 287 |
=item C<1> |
|---|
| 288 |
|
|---|
| 289 |
If set to C<1>, the origin HTTP is never contacted, regardless of the |
|---|
| 290 |
page being in cache or not. If the page is missing from cache, the |
|---|
| 291 |
fetch method will return undef. If the page is in cache, that page |
|---|
| 292 |
will be returned, no matter how old it is. Note that setting this |
|---|
| 293 |
option means the L<URI::Fetch::Response> object will never have the |
|---|
| 294 |
http_response member set. |
|---|
| 295 |
|
|---|
| 296 |
=item C<N>, where N E<gt> 1 |
|---|
| 297 |
|
|---|
| 298 |
The origin HTTP server is not contacted B<if> the page is in cache |
|---|
| 299 |
B<and> the cached page was inserted in the last N seconds. If the |
|---|
| 300 |
cached copy is older than N seconds, a normal HTTP request (full or |
|---|
| 301 |
cache check) is done. |
|---|
| 302 |
|
|---|
| 303 |
=back |
|---|
| 304 |
|
|---|
| 305 |
=item * ContentAlterHook |
|---|
| 306 |
|
|---|
| 307 |
Optional. A subref that gets called with a scalar reference to your |
|---|
| 308 |
content so you can modify the content before it's returned and before |
|---|
| 309 |
it's put in cache. |
|---|
| 310 |
|
|---|
| 311 |
For instance, you may want to only cache the E<lt>headE<gt> section of |
|---|
| 312 |
an HTML document, or you may want to take a feed URL and cache only a |
|---|
| 313 |
pre-parsed version of it. If you modify the scalarref given to your |
|---|
| 314 |
hook and change it into a hashref, scalarref, or some blessed object, |
|---|
| 315 |
that same value will be returned to you later on not-modified |
|---|
| 316 |
responses. |
|---|
| 317 |
|
|---|
| 318 |
=item * CacheEntryGrep |
|---|
| 319 |
|
|---|
| 320 |
Optional. A subref that gets called with the I<URI::Fetch::Response> |
|---|
| 321 |
object about to be cached (with the contents already possibly transformed by |
|---|
| 322 |
your C<ContentAlterHook>). If your subref returns true, the page goes |
|---|
| 323 |
into the cache. If false, it doesn't. |
|---|
| 324 |
|
|---|
| 325 |
=item * Freeze |
|---|
| 326 |
|
|---|
| 327 |
=item * Thaw |
|---|
| 328 |
|
|---|
| 329 |
Optional. Subrefs that get called to serialize and deserialize, respectively, |
|---|
| 330 |
the data that will be cached. The cached data should be assumed to be an |
|---|
| 331 |
arbitrary Perl data structure, containing (potentially) references to |
|---|
| 332 |
arrays, hashes, etc. |
|---|
| 333 |
|
|---|
| 334 |
Freeze should serialize the structure into a scalar; Thaw should |
|---|
| 335 |
deserialize the scalar into a data structure. |
|---|
| 336 |
|
|---|
| 337 |
By default, I<Storable> will be used for freezing and thawing the cached |
|---|
| 338 |
data structure. |
|---|
| 339 |
|
|---|
| 340 |
=item * ForceResponse |
|---|
| 341 |
|
|---|
| 342 |
Optional. A boolean that indicates a I<URI::Fetch::Response> |
|---|
| 343 |
should be returned regardless of the HTTP status. By |
|---|
| 344 |
default C<undef> is returned when a response is not a |
|---|
| 345 |
"success" (200 codes) or one of the recognized HTTP status |
|---|
| 346 |
codes listed above. The HTTP status message can then be retreived |
|---|
| 347 |
using the C<errstr> method on the class. |
|---|
| 348 |
|
|---|
| 349 |
=back |
|---|
| 350 |
|
|---|
| 351 |
=head1 LICENSE |
|---|
| 352 |
|
|---|
| 353 |
I<URI::Fetch> is free software; you may redistribute it and/or modify it |
|---|
| 354 |
under the same terms as Perl itself. |
|---|
| 355 |
|
|---|
| 356 |
=head1 AUTHOR & COPYRIGHT |
|---|
| 357 |
|
|---|
| 358 |
Except where otherwise noted, I<URI::Fetch> is Copyright 2004 Benjamin |
|---|
| 359 |
Trott, ben+cpan@stupidfool.org. All rights reserved. |
|---|
| 360 |
|
|---|
| 361 |
=cut |
|---|
| 362 |
|
|---|