root/feedmelinks/bin/bayesian-learner.pl

Revision 1308, 1.2 kB (checked in by jm3, 2 years ago)
  • heuristics generator: takes a category ("SPAMMERS" or "USERS") and a list of usernames, and generates 10 heuristics for each user. Outputs XML.
  • bayesian learner: reads SPAMMERS.xml and USERS.xml, then uses Ken Williams' naive Bayes implementation in Perl (Algorithm::NaiveBayes) to train a Bayesian classifier, which then categorizes the users in UNKNOWN.xml
  • libs checked in too
  • Property svn:executable set to *
Line 
#!/usr/local/bin/perl
#
# bayesian-learner.pl - train a naive-Bayes classifier on known users and
# known spammers, then score the users listed in UNKNOWN.xml.
#
# Input files (read from the current working directory):
#   USERS.xml, SPAMMERS.xml  training sets; the 'type' attribute of the root
#                            <user_group> element is used as the class label
#   UNKNOWN.xml              users to classify
#
# Every <user> entry is expected to carry a 'heuristics' member, which is
# passed unchanged to Algorithm::NaiveBayes as the attribute hash
# (presumably heuristic name => numeric weight -- confirm against the
# generator that writes these files).
#
# Output: progress and per-user scores on STDOUT.

use strict;
use warnings;

use XML::Simple;
use Data::Dumper;
use Algorithm::NaiveBayes;
use POSIX;

# STDOUT is line-buffered on a terminal; without autoflush the progress dots
# would only show up together with the newline printed after each data set.
$| = 1;

my @TRAINING_FILES = ( "USERS.xml", "SPAMMERS.xml" );
my $UNKNOWN_FILE   = "UNKNOWN.xml";

# read_user_group( $parser, $file )
#
# Parses $file with the given XML::Simple object and returns a two-element
# list: the 'type' attribute of the root <user_group> (undef when absent)
# and an array reference holding its <user> entries.
# Dies if the document has no <user_group> root with content; XMLin itself
# dies on unreadable or malformed files.
sub read_user_group {
    my ( $parser, $file ) = @_;

    my $doc   = $parser->XMLin( $file, KeepRoot => 1 );
    my $group = $doc->{user_group};
    ref($group) eq 'HASH'
        or die "$file: expected a <user_group> root element with content\n";

    # XML::Simple only produces an array when an element is repeated, so a
    # file with exactly one <user> yields a bare hash and a file with none
    # yields nothing at all. Normalise all three shapes to a list.
    my $users = $group->{user};
    my @users
        = ref($users) eq 'ARRAY' ? @{$users}
        : defined $users         ? ($users)
        :                          ();

    return ( $group->{type}, \@users );
}

my $learner = Algorithm::NaiveBayes->new;

# KeyAttr => [] disables XML::Simple's folding of lists into hashes keyed by
# name/id, so <user> entries stay in document order.
my $parser = XML::Simple->new( KeyAttr => [] );

for my $dataset (@TRAINING_FILES) {
    my ( $label, $users ) = read_user_group( $parser, $dataset );
    defined $label
        or die "$dataset: <user_group> has no 'type' attribute to use as a label\n";

    print "Training the learner on *KNOWN* $label: ";

    my $seen = 0;
    for my $user ( @{$users} ) {
        $seen++;
        print "." if $seen % 5 == 0;    # status dot every fifth user
        $learner->add_instance(
            attributes => $user->{heuristics},
            label      => $label,
        );
    }
    print "\n\n";
}

# Build the model before announcing that training has finished.
$learner->train;
print "Training complete. Now let's test the learner on some users who may or may not be spammers:\n\n";

my ( undef, $unknowns ) = read_user_group( $parser, $UNKNOWN_FILE );
for my $user ( @{$unknowns} ) {
    my $result = $learner->predict( attributes => $user->{heuristics} );
    my $name   = defined $user->{name} ? $user->{name} : '';

    print "  " . $name . " - probability of being in the group of: \n";

    # Sorted so the report is stable from run to run (hash order is not).
    for my $group_label ( sort keys %{$result} ) {
        print "    $group_label: "
            . POSIX::floor( 100 * $result->{$group_label} ) . "%\n";
    }
    print "\n";
}
Note: See TracBrowser for help on using the browser.