| 1 |
package HTML::Strip; |
|---|
| 2 |
|
|---|
| 3 |
use 5.006; |
|---|
| 4 |
use warnings; |
|---|
| 5 |
use strict; |
|---|
| 6 |
|
|---|
| 7 |
use Carp qw( carp croak ); |
|---|
| 8 |
|
|---|
| 9 |
require Exporter; |
|---|
| 10 |
require DynaLoader; |
|---|
| 11 |
|
|---|
| 12 |
our @ISA = qw(Exporter DynaLoader); |
|---|
| 13 |
|
|---|
| 14 |
|
|---|
| 15 |
|
|---|
| 16 |
|
|---|
| 17 |
|
|---|
| 18 |
|
|---|
| 19 |
|
|---|
| 20 |
|
|---|
| 21 |
our %EXPORT_TAGS = ( 'all' => [ qw( |
|---|
| 22 |
) ] ); |
|---|
| 23 |
|
|---|
| 24 |
our @EXPORT_OK = ( @{ $EXPORT_TAGS{'all'} } ); |
|---|
| 25 |
|
|---|
| 26 |
our @EXPORT = qw(); |
|---|
| 27 |
|
|---|
| 28 |
our $VERSION = '1.04'; |
|---|
| 29 |
|
|---|
| 30 |
bootstrap HTML::Strip $VERSION; |
|---|
| 31 |
|
|---|
| 32 |
|
|---|
| 33 |
|
|---|
| 34 |
# True when the optional HTML::Entities module could be loaded, undef
# otherwise; parse() consults this flag before decoding entities.
# A block eval is used instead of a string eval so the statement is
# compiled together with the rest of the file; the trailing 1 makes the
# success value explicit.
my $_html_entities_p = eval { require HTML::Entities; 1 };

# Tags used as the "striptags" setting when the caller of new() does not
# supply one.
my @default_striptags = qw( title
                            style
                            script
                            applet );
|---|
| 40 |
|
|---|
| 41 |
# Constructor: builds a new object and applies any settings.
#
# Arguments: the class name, followed by an optional flat list of
# key/value settings. Each key "foo" is dispatched to the "set_foo"
# method when the object can() do it; any other key only produces a
# carp warning. A "striptags" entry pointing at @default_striptags is
# always present unless the caller supplies its own.
#
# Returns the blessed object.
sub new {
    my ($class, @settings) = @_;

    # create() supplies the underlying object (it is defined outside
    # this sub).
    my $self = create();
    bless $self, $class;

    # Caller-supplied pairs come last so they override the default.
    my %options = (striptags => \@default_striptags, @settings);

    for my $name (keys %options) {
        my $value  = $options{$name};
        my $setter = "set_${name}";

        unless( $self->can($setter) ) {
            carp "Invalid setting '$name'";
            next;
        }
        $self->$setter($value);
    }

    return $self;
}
|---|
| 57 |
|
|---|
| 58 |
# Replaces the current set of strip tags.
#
# Accepts either a single array reference or a plain list of tag names.
# Whichever form is given, an array reference is handed on to
# set_striptags_ref(), whose result is passed back to the caller.
sub set_striptags {
    my $self = shift;
    my @tags = @_;

    # An array reference in first position is forwarded untouched;
    # anything else is treated as a list and wrapped in a reference.
    my $tag_list = ref($tags[0]) eq 'ARRAY' ? $tags[0] : \@tags;

    return $self->set_striptags_ref( $tag_list );
}
|---|
| 66 |
|
|---|
| 67 |
# Strips markup from a chunk of text.
#
# Arguments: the object and the text to process.
# The text is first passed through the object's strip_html() method.
# When HTML::Entities was loaded successfully at module load time
# ($_html_entities_p is true), the stripped text is additionally run
# through HTML::Entities::decode().
#
# Returns the resulting string.
sub parse {
    my $self = shift;
    my $text = shift;

    my $plain = $self->strip_html( $text );

    # Without HTML::Entities there is nothing more to do.
    return $plain unless $_html_entities_p;

    # Assign to a scalar first so decode() always runs in scalar context.
    my $decoded = HTML::Entities::decode($plain);
    return $decoded;
}
|---|
| 75 |
|
|---|
| 76 |
# Signals the end of the current input: delegates to the object's
# reset() method and passes its result back to the caller.
sub eof {
    my ($self) = @_;
    return $self->reset();
}
|---|
| 80 |
|
|---|
| 81 |
1; |
|---|
| 82 |
__END__ |
|---|
| 83 |
# Below is stub documentation for your module. You better edit it! |
|---|
| 84 |
|
|---|
| 85 |
=head1 NAME |
|---|
| 86 |
|
|---|
| 87 |
HTML::Strip - Perl extension for stripping HTML markup from text. |
|---|
| 88 |
|
|---|
| 89 |
=head1 SYNOPSIS |
|---|
| 90 |
|
|---|
| 91 |
use HTML::Strip; |
|---|
| 92 |
|
|---|
| 93 |
my $hs = HTML::Strip->new(); |
|---|
| 94 |
|
|---|
| 95 |
my $clean_text = $hs->parse( $raw_html ); |
|---|
| 96 |
$hs->eof; |
|---|
| 97 |
|
|---|
| 98 |
=head1 DESCRIPTION |
|---|
| 99 |
|
|---|
| 100 |
This module simply strips HTML-like markup from text in a very quick |
|---|
| 101 |
and brutal manner. It could quite easily be used to strip XML or SGML |
|---|
| 102 |
from text as well; but removing HTML markup is a much more common |
|---|
| 103 |
problem, hence this module lives in the HTML:: namespace. |
|---|
| 104 |
|
|---|
| 105 |
It is written in XS, and thus about five times quicker than using |
|---|
| 106 |
regular expressions for the same task. |
|---|
| 107 |
|
|---|
| 108 |
It does I<not> do any syntax checking (if you want that, use |
|---|
| 109 |
L<HTML::Parser>), instead it merely applies the following rules: |
|---|
| 110 |
|
|---|
| 111 |
=over 4 |
|---|
| 112 |
|
|---|
| 113 |
=item 1 |
|---|
| 114 |
|
|---|
| 115 |
Anything that looks like a tag, or group of tags will be replaced with |
|---|
| 116 |
a single space character. Tags are considered to be anything that |
|---|
| 117 |
starts with a C<E<lt>> and ends with a C<E<gt>>; with the caveat that a |
|---|
| 118 |
C<E<gt>> character may appear in either of the following without |
|---|
| 119 |
ending the tag: |
|---|
| 120 |
|
|---|
| 121 |
=over 4 |
|---|
| 122 |
|
|---|
| 123 |
=item Quote |
|---|
| 124 |
|
|---|
| 125 |
Quotes are considered to start with either a C<'> or a C<"> character, |
|---|
| 126 |
and end with a matching character I<not> preceded by an odd number of |
|---|
| 127 |
escaping backslashes (i.e. C<\"> does not end the quote but C<\\\\"> does). |
|---|
| 128 |
|
|---|
| 129 |
=item Comment |
|---|
| 130 |
|
|---|
| 131 |
If the tag starts with an exclamation mark, it is assumed to be a |
|---|
| 132 |
declaration or a comment. Within such tags, C<E<gt>> characters do not |
|---|
| 133 |
end the tag if they appear within pairs of double dashes (e.g. C<E<lt>!-- |
|---|
| 134 |
E<lt>a href="old.htm"E<gt>old pageE<lt>/aE<gt> --E<gt>> would be |
|---|
| 135 |
stripped completely). |
|---|
| 136 |
|
|---|
| 137 |
=back |
|---|
| 138 |
|
|---|
| 139 |
=item 2 |
|---|
| 140 |
|
|---|
| 141 |
Anything that appears within so-called I<strip tags> is stripped as |
|---|
| 142 |
well. By default, these tags are C<title>, C<script>, C<style> and |
|---|
| 143 |
C<applet>. |
|---|
| 144 |
|
|---|
| 145 |
=back |
|---|
| 146 |
|
|---|
| 147 |
HTML::Strip maintains state between calls, so you can parse a document |
|---|
| 148 |
in chunks should you wish. If one chunk ends half-way through a tag, |
|---|
| 149 |
quote, comment, or whatever; it will remember this, and expect the |
|---|
| 150 |
next call to parse to start with the remains of said tag. |
|---|
| 151 |
|
|---|
| 152 |
If this is not going to be the case, be sure to call $hs->eof() |
|---|
| 153 |
between calls to $hs->parse(). |
|---|
| 154 |
|
|---|
| 155 |
=head2 METHODS |
|---|
| 156 |
|
|---|
| 157 |
=item new() |
|---|
| 158 |
|
|---|
| 159 |
Constructor. Can optionally take a hash of settings (with keys |
|---|
| 160 |
corresponding to the C<set_> methods below). |
|---|
| 161 |
|
|---|
| 162 |
For example, the following is a valid constructor: |
|---|
| 163 |
|
|---|
| 164 |
my $hs = HTML::Strip->new( |
|---|
| 165 |
striptags => [ 'script', 'iframe' ], |
|---|
| 166 |
emit_spaces => 0 |
|---|
| 167 |
); |
|---|
| 168 |
|
|---|
| 169 |
=item parse() |
|---|
| 170 |
|
|---|
| 171 |
Takes a string as an argument, returns it stripped of HTML. |
|---|
| 172 |
|
|---|
| 173 |
=item eof() |
|---|
| 174 |
|
|---|
| 175 |
Resets the current state information, ready to parse a new block of HTML. |
|---|
| 176 |
|
|---|
| 177 |
=item clear_striptags() |
|---|
| 178 |
|
|---|
| 179 |
Clears the current set of strip tags. |
|---|
| 180 |
|
|---|
| 181 |
=item add_striptag() |
|---|
| 182 |
|
|---|
| 183 |
Adds the string passed as an argument to the current set of strip tags. |
|---|
| 184 |
|
|---|
| 185 |
=item set_striptags() |
|---|
| 186 |
|
|---|
| 187 |
Takes a list of strings, or a reference to an array of strings, which replace the current |
|---|
| 188 |
set of strip tags. |
|---|
| 189 |
|
|---|
| 190 |
=item set_emit_spaces() |
|---|
| 191 |
|
|---|
| 192 |
Takes a boolean value. If set to false, HTML::Strip will not attempt |
|---|
| 193 |
any conversion of tags into spaces. Set to true by default. |
|---|
| 194 |
|
|---|
| 195 |
=head2 LIMITATIONS |
|---|
| 196 |
|
|---|
| 197 |
=over 4 |
|---|
| 198 |
|
|---|
| 199 |
=item Whitespace |
|---|
| 200 |
|
|---|
| 201 |
Despite only outputting one space character per group of tags, and |
|---|
| 202 |
avoiding doing so when tags are bordered by spaces or the start or |
|---|
| 203 |
end of strings, HTML::Strip can often output more than desired; such |
|---|
| 204 |
as with the following HTML: |
|---|
| 205 |
|
|---|
| 206 |
<h1> HTML::Strip </h1> <p> <em> <strong> fast, and brutal </strong> </em> </p> |
|---|
| 207 |
|
|---|
| 208 |
Which gives the following output: |
|---|
| 209 |
|
|---|
| 210 |
C<E<nbsp>HTML::StripE<nbsp>E<nbsp>E<nbsp>E<nbsp>fast, and brutalE<nbsp>E<nbsp>E<nbsp>> |
|---|
| 211 |
|
|---|
| 212 |
Thus, you may want to post-filter the output of HTML::Strip to remove |
|---|
| 213 |
excess whitespace (for example, using C<tr/ / /s;>). |
|---|
| 214 |
(This has been improved since previous releases, but is still an issue) |
|---|
| 215 |
|
|---|
| 216 |
=item HTML Entities |
|---|
| 217 |
|
|---|
| 218 |
HTML::Strip will only attempt decoding of HTML entities if |
|---|
| 219 |
L<HTML::Entities> is installed (whereupon it will do so |
|---|
| 220 |
automatically). |
|---|
| 221 |
|
|---|
| 222 |
=head2 EXPORT |
|---|
| 223 |
|
|---|
| 224 |
None by default. |
|---|
| 225 |
|
|---|
| 226 |
=head1 AUTHOR |
|---|
| 227 |
|
|---|
| 228 |
Alex Bowley E<lt>kilinrax@cpan.orgE<gt> |
|---|
| 229 |
|
|---|
| 230 |
=head1 SEE ALSO |
|---|
| 231 |
|
|---|
| 232 |
L<perl>, L<HTML::Parser>, L<HTML::Entities> |
|---|
| 233 |
|
|---|
| 234 |
=cut |
|---|
| 235 |
|
|---|