root/feedmelinks/lib/URI/Fetch.pm

Revision 1448, 11.0 kB (checked in by jm3, 2 years ago)

new spammer blacklist, perl libs, and better splinker hunting tools

Line 
1 # $Id: Fetch.pm 1952 2006-07-25 05:41:24Z btrott $
2
3 package URI::Fetch;
4 use strict;
5 use base qw( Class::ErrorHandler );
6
7 use LWP::UserAgent;
8 use Carp qw( croak );
9 use URI;
10 use URI::Fetch::Response;
11
12 our $VERSION = '0.08';
13
14 our $HAS_ZLIB;
15 BEGIN {
16     $HAS_ZLIB = eval "use Compress::Zlib (); 1;";
17 }
18
19 use constant URI_OK                => 200;
20 use constant URI_MOVED_PERMANENTLY => 301;
21 use constant URI_NOT_MODIFIED      => 304;
22 use constant URI_GONE              => 410;
23
24 sub fetch {
25     my $class = shift;
26     my($uri, %param) = @_;
27
28     # get user parameters
29     my $cache        = delete $param{Cache};
30     my $ua           = delete $param{UserAgent};
31     my $p_etag       = delete $param{ETag};
32     my $p_lastmod    = delete $param{LastModified};
33     my $content_hook = delete $param{ContentAlterHook};
34     my $p_no_net     = delete $param{NoNetwork};
35     my $p_cache_grep = delete $param{CacheEntryGrep};
36     my $freeze       = delete $param{Freeze};
37     my $thaw         = delete $param{Thaw};
38     my $force        = delete $param{ForceResponse};
39     croak("Unknown parameters: " . join(", ", keys %param))
40         if %param;
41
42     my $ref;
43     if ($cache) {
44         unless ($freeze && $thaw) {
45             require Storable;
46             $thaw = \&Storable::thaw;
47             $freeze = \&Storable::freeze;
48         }
49         if (my $blob = $cache->get($uri)) {
50             $ref = $thaw->($blob);
51         }
52     }
53
54     # NoNetwork support (see pod docs below for logic clarification)
55     if ($p_no_net) {
56         croak("Invalid NoNetworkValue (negative)") if $p_no_net < 0;
57         if ($ref && ($p_no_net == 1 || $ref->{CacheTime} > time() - $p_no_net)) {
58
59             my $fetch = URI::Fetch::Response->new;
60             $fetch->status(URI_OK);
61             $fetch->content($ref->{Content});
62             $fetch->etag($ref->{ETag});
63             $fetch->last_modified($ref->{LastModified});
64             $fetch->content_type($ref->{ContentType});
65             return $fetch;
66         }
67         return undef if $p_no_net == 1;
68     }
69
70     $ua ||= LWP::UserAgent->new;
71     $ua->agent(join '/', $class, $class->VERSION)
72         if $ua->agent =~ /^libwww-perl/;
73
74     my $req = HTTP::Request->new(GET => $uri);
75     if ($HAS_ZLIB) {
76         $req->header('Accept-Encoding', 'gzip');
77     }
78     if (my $etag = ($p_etag || $ref->{ETag})) {
79         $req->header('If-None-Match', $etag);
80     }
81     if (my $ts = ($p_lastmod || $ref->{LastModified})) {
82         $req->if_modified_since($ts);
83     }
84
85     my $res = $ua->request($req);
86     my $fetch = URI::Fetch::Response->new;
87     $fetch->uri($uri);
88     $fetch->http_status($res->code);
89     $fetch->http_response($res);
90     $fetch->content_type($res->header('Content-Type'));
91     if ($res->previous && $res->previous->code == HTTP::Status::RC_MOVED_PERMANENTLY()) {
92         $fetch->status(URI_MOVED_PERMANENTLY);
93         $fetch->uri($res->previous->header('Location'));
94     } elsif ($res->code == HTTP::Status::RC_GONE()) {
95         $fetch->status(URI_GONE);
96         $fetch->uri(undef);
97         return $fetch;
98     } elsif ($res->code == HTTP::Status::RC_NOT_MODIFIED()) {
99         $fetch->status(URI_NOT_MODIFIED);
100         $fetch->content($ref->{Content});
101         $fetch->etag($ref->{ETag});
102         $fetch->last_modified($ref->{LastModified});
103         $fetch->content_type($ref->{ContentType});
104         return $fetch;
105     } elsif (!$res->is_success) {
106         return $force ? $fetch : $class->error($res->message);
107        
108     } else {
109         $fetch->status(URI_OK);
110     }
111     $fetch->last_modified($res->last_modified);
112     $fetch->etag($res->header('ETag'));
113     my $content = $res->content;
114     if ($res->content_encoding && $res->content_encoding eq 'gzip') {
115         $content = Compress::Zlib::memGunzip($content);
116     }
117
118     # let caller-defined transform hook modify the result that'll be
119     # cached.  perhaps the caller only wants the <head> section of
120     # HTML, or wants to change the content to a parsed datastructure
121     # already serialized with Storable.
122     if ($content_hook) {
123         croak("ContentAlterHook is not a subref") unless ref $content_hook eq "CODE";
124         $content_hook->(\$content);
125     }
126
127     $fetch->content($content);
128
129     # cache by default, if there's a cache.  but let callers cancel
130     # the cache action by defining a cache grep hook
131     if ($cache &&
132         ($p_cache_grep ? $p_cache_grep->($fetch) : 1)) {
133
134         $cache->set($fetch->uri, $freeze->({
135             ETag         => $fetch->etag,
136             LastModified => $fetch->last_modified,
137             Content      => $fetch->content,
138             CacheTime    => time(),
139             ContentType  => $fetch->content_type,
140         }));
141     }
142     $fetch;
143 }
144
145 1;
146 __END__
147
148 =head1 NAME
149
150 URI::Fetch - Smart URI fetching/caching
151
152 =head1 SYNOPSIS
153
154     use URI::Fetch;
155
156     ## Simple fetch.
157     my $res = URI::Fetch->fetch('http://example.com/atom.xml')
158         or die URI::Fetch->errstr;
159
160     ## Fetch using specified ETag and Last-Modified headers.
161     my $res = URI::Fetch->fetch('http://example.com/atom.xml',
162             ETag => '123-ABC',
163             LastModified => time - 3600,
164     )
165         or die URI::Fetch->errstr;
166
167     ## Fetch using an on-disk cache that URI::Fetch manages for you.
168     my $cache = Cache::File->new( cache_root => '/tmp/cache' );
169     my $res = URI::Fetch->fetch('http://example.com/atom.xml',
170             Cache => $cache
171     )
172         or die URI::Fetch->errstr;
173
174 =head1 DESCRIPTION
175
176 I<URI::Fetch> is a smart client for fetching HTTP pages, notably
177 syndication feeds (RSS, Atom, and others), in an intelligent,
178 bandwidth- and time-saving way. That means:
179
180 =over 4
181
182 =item * GZIP support
183
184 If you have I<Compress::Zlib> installed, I<URI::Fetch> will automatically
185 try to download a compressed version of the content, saving bandwidth (and
186 time).
187
188 =item * I<Last-Modified> and I<ETag> support
189
190 If you use a local cache (see the I<Cache> parameter to I<fetch>),
191 I<URI::Fetch> will keep track of the I<Last-Modified> and I<ETag> headers
192 from the server, allowing you to only download pages that have been
193 modified since the last time you checked.
194
195 =item * Proper understanding of HTTP error codes
196
197 Certain HTTP error codes are special, particularly when fetching syndication
198 feeds, and well-written clients should pay special attention to them.
199 I<URI::Fetch> can only do so much for you in this regard, but it gives
200 you the tools to be a well-written client.
201
202 The response from I<fetch> gives you the raw HTTP response code, along with
203 special handling of 4 codes:
204
205 =over 4
206
207 =item * 200 (OK)
208
209 Signals that the content of a page/feed was retrieved
210 successfully.
211
212 =item * 301 (Moved Permanently)
213
214 Signals that a page/feed has moved permanently, and that
215 your database of feeds should be updated to reflect the new
216 URI.
217
218 =item * 304 (Not Modified)
219
220 Signals that a page/feed has not changed since it was last
221 fetched.
222
223 =item * 410 (Gone)
224
225 Signals that a page/feed is gone and will never be coming back,
226 so you should stop trying to fetch it.
227
228 =back
229
230 =head1 USAGE
231
232 =head2 URI::Fetch->fetch($uri, %param)
233
234 Fetches a page identified by the URI I<$uri>.
235
236 On success, returns a I<URI::Fetch::Response> object; on failure, returns
237 C<undef>.
238
239 I<%param> can contain:
240
241 =over 4
242
243 =item * LastModified
244
245 =item * ETag
246
247 I<LastModified> and I<ETag> can be supplied to force the server to only
248 return the full page if it's changed since the last request. If you're
249 writing your own feed client, this is recommended practice, because it
250 limits both your bandwidth use and the server's.
251
252 If you'd rather not have to store the I<LastModified> time and I<ETag>
253 yourself, see the I<Cache> parameter below (and the L<SYNOPSIS> above).
254
255 =item * Cache
256
257 If you'd like I<URI::Fetch> to cache responses between requests, provide
258 the I<Cache> parameter with an object supporting the L<Cache> API (e.g.
259 I<Cache::File>, I<Cache::Memory>). Specifically, an object that supports
260 C<$cache-E<gt>get($key)> and C<$cache-E<gt>set($key, $value, $expires)>.
261
262 If supplied, I<URI::Fetch> will store the page content, ETag, and
263 last-modified time of the response in the cache, and will pull the
264 content from the cache on subsequent requests if the page returns a
265 Not-Modified response.
266
267 =item * UserAgent
268
269 Optional.  You may provide your own LWP::UserAgent instance.  Look
270 into L<LWPx::ParanoidUserAgent> if you're fetching URLs given to you
271 by possibly malicious parties.
272
273 =item * NoNetwork
274
275 Optional.  Controls the interaction between the cache and HTTP
276 requests with If-Modified-Since/If-None-Match headers.  Possible
277 behaviors are:
278
279 =over
280
281 =item false (default)
282
283 If a page is in the cache, the origin HTTP server is always checked
284 for a fresher copy with an If-Modified-Since and/or If-None-Match
285 header.
286
287 =item C<1>
288
289 If set to C<1>, the origin HTTP is never contacted, regardless of the
290 page being in cache or not.  If the page is missing from cache, the
291 fetch method will return undef.  If the page is in cache, that page
292 will be returned, no matter how old it is.  Note that setting this
293 option means the L<URI::Fetch::Response> object will never have the
294 http_response member set.
295
296 =item C<N>, where N E<gt> 1
297
298 The origin HTTP server is not contacted B<if> the page is in cache
299 B<and> the cached page was inserted in the last N seconds.  If the
300 cached copy is older than N seconds, a normal HTTP request (full or
301 cache check) is done.
302
303 =back
304
305 =item * ContentAlterHook
306
307 Optional.  A subref that gets called with a scalar reference to your
308 content so you can modify the content before it's returned and before
309 it's put in cache.
310
311 For instance, you may want to only cache the E<lt>headE<gt> section of
312 an HTML document, or you may want to take a feed URL and cache only a
313 pre-parsed version of it.  If you modify the scalarref given to your
314 hook and change it into a hashref, scalarref, or some blessed object,
315 that same value will be returned to you later on not-modified
316 responses.
317
318 =item * CacheEntryGrep
319
320 Optional.  A subref that gets called with the I<URI::Fetch::Response>
321 object about to be cached (with the contents already possibly transformed by
322 your C<ContentAlterHook>).  If your subref returns true, the page goes
323 into the cache.  If false, it doesn't.
324
325 =item * Freeze
326
327 =item * Thaw
328
329 Optional. Subrefs that get called to serialize and deserialize, respectively,
330 the data that will be cached. The cached data should be assumed to be an
331 arbitrary Perl data structure, containing (potentially) references to
332 arrays, hashes, etc.
333
334 Freeze should serialize the structure into a scalar; Thaw should
335 deserialize the scalar into a data structure.
336
337 By default, I<Storable> will be used for freezing and thawing the cached
338 data structure.
339
340 =item * ForceResponse
341
342 Optional. A boolean that indicates a I<URI::Fetch::Response>
343 should be returned regardless of the HTTP status. By
344 default C<undef> is returned when a response is not a
345 "success" (200 codes) or one of the recognized HTTP status
346 codes listed above. The HTTP status message can then be retreived
347 using the C<errstr> method on the class.
348
349 =back
350
351 =head1 LICENSE
352
353 I<URI::Fetch> is free software; you may redistribute it and/or modify it
354 under the same terms as Perl itself.
355
356 =head1 AUTHOR & COPYRIGHT
357
358 Except where otherwise noted, I<URI::Fetch> is Copyright 2004 Benjamin
359 Trott, ben+cpan@stupidfool.org. All rights reserved.
360
361 =cut
362
Note: See TracBrowser for help on using the browser.