| 1 |
package Algorithm::NaiveBayes; |
|---|
| 2 |
|
|---|
| 3 |
$VERSION = '0.03'; |
|---|
| 4 |
use strict; |
|---|
| 5 |
|
|---|
| 6 |
sub new { |
|---|
| 7 |
my $package = shift; |
|---|
| 8 |
my $self = { |
|---|
| 9 |
purge => 1, |
|---|
| 10 |
model_type => 'Frequency', |
|---|
| 11 |
@_, |
|---|
| 12 |
instances => 0, |
|---|
| 13 |
training_data => {}, |
|---|
| 14 |
}; |
|---|
| 15 |
|
|---|
| 16 |
if ($package eq __PACKAGE__) { |
|---|
| 17 |
|
|---|
| 18 |
die "model_class cannot be set to " . __PACKAGE__ if ($self->{model_class}||'') eq __PACKAGE__; |
|---|
| 19 |
$package = $self->{model_class} || __PACKAGE__ . "::Model::" . $self->{model_type}; |
|---|
| 20 |
eval "require $package" unless $package->can('new'); |
|---|
| 21 |
return $package->new(@_); |
|---|
| 22 |
} |
|---|
| 23 |
|
|---|
| 24 |
return bless $self, $package; |
|---|
| 25 |
} |
|---|
| 26 |
|
|---|
| 27 |
sub add_instance { |
|---|
| 28 |
my ($self, %params) = @_; |
|---|
| 29 |
for ('attributes', 'label') { |
|---|
| 30 |
die "Missing required '$_' parameter" unless exists $params{$_}; |
|---|
| 31 |
} |
|---|
| 32 |
for ($params{label}) { |
|---|
| 33 |
$_ = [$_] unless ref; |
|---|
| 34 |
@{$self->{labels}}{@$_} = (); |
|---|
| 35 |
} |
|---|
| 36 |
|
|---|
| 37 |
$self->{instances}++; |
|---|
| 38 |
$self->do_add_instance($params{attributes}, $params{label}, $self->{training_data}); |
|---|
| 39 |
} |
|---|
| 40 |
|
|---|
| 41 |
sub labels { keys %{ $_[0]->{labels} } } |
|---|
| 42 |
sub instances { $_[0]->{instances} } |
|---|
| 43 |
sub training_data { $_[0]->{training_data} } |
|---|
| 44 |
|
|---|
| 45 |
sub train { |
|---|
| 46 |
my $self = shift; |
|---|
| 47 |
$self->{model} = $self->do_train($self->{training_data}); |
|---|
| 48 |
$self->do_purge if $self->purge; |
|---|
| 49 |
} |
|---|
| 50 |
|
|---|
| 51 |
sub do_purge { |
|---|
| 52 |
my $self = shift; |
|---|
| 53 |
delete $self->{training_data}; |
|---|
| 54 |
} |
|---|
| 55 |
|
|---|
| 56 |
sub purge { |
|---|
| 57 |
my $self = shift; |
|---|
| 58 |
$self->{purge} = shift if @_; |
|---|
| 59 |
return $self->{purge}; |
|---|
| 60 |
} |
|---|
| 61 |
|
|---|
| 62 |
sub predict { |
|---|
| 63 |
my ($self, %params) = @_; |
|---|
| 64 |
my $newattrs = $params{attributes} or die "Missing 'attributes' parameter for predict()"; |
|---|
| 65 |
return $self->do_predict($self->{model}, $newattrs); |
|---|
| 66 |
} |
|---|
| 67 |
|
|---|
| 68 |
1; |
|---|
| 69 |
__END__ |
|---|
| 70 |
# Below is stub documentation for your module. You better edit it! |
|---|
| 71 |
|
|---|
| 72 |
=head1 NAME |
|---|
| 73 |
|
|---|
| 74 |
Algorithm::NaiveBayes - Bayesian prediction of categories |
|---|
| 75 |
|
|---|
| 76 |
=head1 SYNOPSIS |
|---|
| 77 |
|
|---|
| 78 |
use Algorithm::NaiveBayes; |
|---|
| 79 |
my $nb = Algorithm::NaiveBayes->new; |
|---|
| 80 |
|
|---|
| 81 |
$nb->add_instance |
|---|
| 82 |
(attributes => {foo => 1, bar => 1, baz => 3}, |
|---|
| 83 |
label => 'sports'); |
|---|
| 84 |
|
|---|
| 85 |
$nb->add_instance |
|---|
| 86 |
(attributes => {foo => 2, blurp => 1}, |
|---|
| 87 |
label => ['sports', 'finance']); |
|---|
| 88 |
|
|---|
| 89 |
... repeat for several more instances, then: |
|---|
| 90 |
$nb->train; |
|---|
| 91 |
|
|---|
| 92 |
# Find results for unseen instances |
|---|
| 93 |
my $result = $nb->predict |
|---|
| 94 |
(attributes => {bar => 3, blurp => 2}); |
|---|
| 95 |
|
|---|
| 96 |
|
|---|
| 97 |
=head1 DESCRIPTION |
|---|
| 98 |
|
|---|
| 99 |
This module implements the classic "Naive Bayes" machine learning |
|---|
| 100 |
algorithm. It is a well-studied probabilistic algorithm often used in |
|---|
| 101 |
automatic text categorization. Compared to other algorithms (kNN, |
|---|
| 102 |
SVM, Decision Trees), it's pretty fast and reasonably competitive in |
|---|
| 103 |
the quality of its results. |
|---|
| 104 |
|
|---|
| 105 |
A paper by Fabrizio Sebastiani provides a really good introduction to |
|---|
| 106 |
text categorization: |
|---|
| 107 |
L<http://faure.iei.pi.cnr.it/~fabrizio/Publications/ACMCS02.pdf> |
|---|
| 108 |
|
|---|
| 109 |
=head1 METHODS |
|---|
| 110 |
|
|---|
| 111 |
=over 4 |
|---|
| 112 |
|
|---|
| 113 |
=item new() |
|---|
| 114 |
|
|---|
| 115 |
Creates a new C<Algorithm::NaiveBayes> object and returns it. The |
|---|
| 116 |
following parameters are accepted: |
|---|
| 117 |
|
|---|
| 118 |
=over 4 |
|---|
| 119 |
|
|---|
| 120 |
=item purge |
|---|
| 121 |
|
|---|
| 122 |
If set to a true value, the C<do_purge()> method will be invoked during |
|---|
| 123 |
C<train()>. The default is true. Set this to a false value if you'd |
|---|
| 124 |
like to be able to add additional instances after training and then |
|---|
| 125 |
call C<train()> again. |
|---|
| 126 |
|
|---|
| 127 |
=back |
|---|
| 128 |
|
|---|
| 129 |
=item add_instance( attributes =E<gt> HASH, label =E<gt> STRING|ARRAY ) |
|---|
| 130 |
|
|---|
| 131 |
Adds a training instance to the categorizer. The C<attributes> |
|---|
| 132 |
parameter contains a hash reference whose keys are string attributes |
|---|
| 133 |
and whose values are the weights of those attributes. For instance, |
|---|
| 134 |
if you're categorizing text documents, the attributes might be the |
|---|
| 135 |
words of the document, and the weights might be the number of times |
|---|
| 136 |
each word occurs in the document. |
|---|
| 137 |
|
|---|
| 138 |
The C<label> parameter can contain a single string or an array of |
|---|
| 139 |
strings, with each string representing a label for this instance. The |
|---|
| 140 |
labels can be any arbitrary strings. To indicate that a document has no |
|---|
| 141 |
applicable labels, pass an empty array reference. |
|---|
| 142 |
|
|---|
| 143 |
=item train() |
|---|
| 144 |
|
|---|
| 145 |
Calculates the probabilities that will be necessary for categorization |
|---|
| 146 |
using the C<predict()> method. |
|---|
| 147 |
|
|---|
| 148 |
=item predict( attributes =E<gt> HASH ) |
|---|
| 149 |
|
|---|
| 150 |
Use this method to predict the label of an unknown instance. The |
|---|
| 151 |
attributes should be of the same format as you passed to |
|---|
| 152 |
C<add_instance()>. C<predict()> returns a hash reference whose keys |
|---|
| 153 |
are the names of labels, and whose values are the score for each |
|---|
| 154 |
label. Scores are between 0 and 1, where 0 means the label doesn't |
|---|
| 155 |
seem to apply to this instance, and 1 means it does. |
|---|
| 156 |
|
|---|
| 157 |
In practice, scores using Naive Bayes tend to be very close to 0 or 1 |
|---|
| 158 |
because of the way normalization is performed. I might try to |
|---|
| 159 |
alleviate this in future versions of the code. |
|---|
| 160 |
|
|---|
| 161 |
=item labels() |
|---|
| 162 |
|
|---|
| 163 |
Returns a list of all the labels the object knows about (in no |
|---|
| 164 |
particular order), or the number of labels if called in a scalar |
|---|
| 165 |
context. |
|---|
| 166 |
|
|---|
| 167 |
=item do_purge() |
|---|
| 168 |
|
|---|
| 169 |
Purges training instances and their associated information from the |
|---|
| 170 |
NaiveBayes object. This can save memory after training. |
|---|
| 171 |
|
|---|
| 172 |
=item purge() |
|---|
| 173 |
|
|---|
| 174 |
Returns true or false depending on the value of the object's C<purge> |
|---|
| 175 |
property. An optional boolean argument sets the property. |
|---|
| 176 |
|
|---|
| 177 |
=back |
|---|
| 178 |
|
|---|
| 179 |
=head1 THEORY |
|---|
| 180 |
|
|---|
| 181 |
Bayes' Theorem is a way of inverting a conditional probability. It |
|---|
| 182 |
states: |
|---|
| 183 |
|
|---|
| 184 |
P(y|x) P(x) |
|---|
| 185 |
P(x|y) = ------------- |
|---|
| 186 |
P(y) |
|---|
| 187 |
|
|---|
| 188 |
The notation C<P(x|y)> means "the probability of C<x> given C<y>." See also |
|---|
| 189 |
L<"http://mathforum.org/dr.math/problems/battisfore.03.22.99.html"> |
|---|
| 190 |
for a simple but complete example of Bayes' Theorem. |
|---|
| 191 |
|
|---|
| 192 |
In this case, we want to know the probability of a given category given a |
|---|
| 193 |
certain string of words in a document, so we have: |
|---|
| 194 |
|
|---|
| 195 |
P(words | cat) P(cat) |
|---|
| 196 |
P(cat | words) = -------------------- |
|---|
| 197 |
P(words) |
|---|
| 198 |
|
|---|
| 199 |
We have applied Bayes' Theorem because C<P(cat | words)> is a difficult |
|---|
| 200 |
quantity to compute directly, but C<P(words | cat)> and C<P(cat)> are accessible |
|---|
| 201 |
(see below). |
|---|
| 202 |
|
|---|
| 203 |
The greater the expression above, the greater the probability that the given |
|---|
| 204 |
document belongs to the given category. So we want to find the maximum |
|---|
| 205 |
value. We write this as |
|---|
| 206 |
|
|---|
| 207 |
P(words | cat) P(cat) |
|---|
| 208 |
Best category = ArgMax ----------------------- |
|---|
| 209 |
cat in cats P(words) |
|---|
| 210 |
|
|---|
| 211 |
|
|---|
| 212 |
Since C<P(words)> doesn't change over the range of categories, we can get rid |
|---|
| 213 |
of it. That's good, because we didn't want to have to compute these values |
|---|
| 214 |
anyway. So our new formula is: |
|---|
| 215 |
|
|---|
| 216 |
Best category = ArgMax P(words | cat) P(cat) |
|---|
| 217 |
cat in cats |
|---|
| 218 |
|
|---|
| 219 |
Finally, we note that if C<w1, w2, ... wn> are the words in the document, |
|---|
| 220 |
then this expression is equivalent to: |
|---|
| 221 |
|
|---|
| 222 |
Best category = ArgMax P(w1|cat)*P(w2|cat)*...*P(wn|cat)*P(cat) |
|---|
| 223 |
cat in cats |
|---|
| 224 |
|
|---|
| 225 |
That's the formula I use in my document categorization code. The last |
|---|
| 226 |
step is the only non-rigorous one in the derivation, and this is the |
|---|
| 227 |
"naive" part of the Naive Bayes technique. It assumes that the |
|---|
| 228 |
probability of each word appearing in a document is unaffected by the |
|---|
| 229 |
presence or absence of each other word in the document. We assume |
|---|
| 230 |
this even though we know this isn't true: for example, the word |
|---|
| 231 |
"iodized" is far more likely to appear in a document that contains the |
|---|
| 232 |
word "salt" than it is to appear in a document that contains the word |
|---|
| 233 |
"subroutine". Luckily, as it turns out, making this assumption even |
|---|
| 234 |
when it isn't true may have little effect on our results, as the |
|---|
| 235 |
following paper by Pedro Domingos argues: |
|---|
| 236 |
L<"http://www.cs.washington.edu/homes/pedrod/mlj97.ps.gz"> |
|---|
| 237 |
|
|---|
| 238 |
|
|---|
| 239 |
=head1 HISTORY |
|---|
| 240 |
|
|---|
| 241 |
My first implementation of a Naive Bayes algorithm was in the |
|---|
| 242 |
now-obsolete AI::Categorize module, first released in May 2001. I |
|---|
| 243 |
replaced it with the Naive Bayes implementation in AI::Categorizer |
|---|
| 244 |
(note the extra 'r'), first released in July 2002. I then extracted |
|---|
| 245 |
that implementation into its own module that could be used outside the |
|---|
| 246 |
framework, and that's what you see here. |
|---|
| 247 |
|
|---|
| 248 |
=head1 AUTHOR |
|---|
| 249 |
|
|---|
| 250 |
Ken Williams, ken@mathforum.org |
|---|
| 251 |
|
|---|
| 252 |
=head1 COPYRIGHT |
|---|
| 253 |
|
|---|
| 254 |
Copyright 2003-2004 Ken Williams. All rights reserved. |
|---|
| 255 |
|
|---|
| 256 |
This library is free software; you can redistribute it and/or |
|---|
| 257 |
modify it under the same terms as Perl itself. |
|---|
| 258 |
|
|---|
| 259 |
|
|---|
| 260 |
=head1 SEE ALSO |
|---|
| 261 |
|
|---|
| 262 |
AI::Categorizer(3), L<perl>. |
|---|
| 263 |
|
|---|
| 264 |
=cut |
|---|
| 265 |
|
|---|