|
Revision 1308, 1.2 kB
(checked in by jm3, 2 years ago)
|
- heuristics generator: takes a category ("SPAMMERS" or "USERS") and a list of usernames, and generates 10 heuristics for each user. Outputs XML.
- bayesian learner: reads SPAMMERS.xml and USERS.xml, then uses Ken Williams' Naive-Bayesian algorithm in Perl (Algorithm::NaiveBayes) to train a Bayesian classifier, which then categorizes the users in UNKNOWN.xml
- libs checked in too
|
- Property svn:executable set to
*
|
| Line | |
|---|
| 1 |
|
|---|
| 2 |
use XML::Simple; |
|---|
| 3 |
use Data::Dumper; |
|---|
| 4 |
use Algorithm::NaiveBayes; |
|---|
| 5 |
use POSIX; |
|---|
| 6 |
|
|---|
| 7 |
use strict;
use warnings;

# Trains a Naive Bayes classifier on the users listed in the known-group XML
# files, then prints, for every user in UNKNOWN.xml, the predicted score of
# belonging to each trained group.
#
# Each XML file is expected to have a <user_group type="..."> root element
# containing <user> children; each user's `heuristics` entry is passed
# straight to the learner as its attribute set.
# NOTE(review): Algorithm::NaiveBayes wants `attributes` to be a hashref of
# feature => weight -- confirm the heuristics generator emits that shape.

# Unbuffer STDOUT so the progress dots appear while training runs instead of
# all at once when the trailing newline is printed.
local $| = 1;

my @training_files = ( 'USERS.xml', 'SPAMMERS.xml' );
my $unknown_file   = 'UNKNOWN.xml';

# Parses $file and returns the hashref for its <user_group> root element.
# Dies if the file has no such root (XMLin itself dies on unreadable or
# malformed input).
sub load_user_group {
    my ($file) = @_;

    # KeyAttr => []  : never fold child elements into a hash keyed by name/id.
    # ForceArray     : keep {user} an array ref even when the group contains
    #                  exactly one <user>; without it a single user parses to
    #                  a plain hashref and the list dereference below dies.
    my $parser = XML::Simple->new( KeyAttr => [], ForceArray => ['user'] );
    my $doc    = $parser->XMLin( $file, KeepRoot => 1 );

    my $group = $doc->{user_group};
    die "$file: expected a <user_group> root element\n"
        unless ref $group eq 'HASH';

    return $group;
}

my $learner = Algorithm::NaiveBayes->new;

for my $file (@training_files) {
    my $group = load_user_group($file);

    # The group's type attribute is the label every user in it is trained as.
    my $label = $group->{type};
    die "$file: <user_group> has no usable 'type' attribute\n"
        unless defined $label && !ref $label;

    print "Training the learner on *KNOWN* $label: ";

    my $count = 0;
    for my $user ( @{ $group->{user} || [] } ) {
        $count++;
        print "." if $count % 5 == 0;    # one progress dot per five users
        $learner->add_instance(
            attributes => $user->{heuristics},
            label      => $label,
        );
    }
    print "\n\n";
}

# Build the model first, so the "Training complete" message is truthful.
$learner->train;
print "Training complete. Now let's test the learner on some users who may or may not be spammers:\n\n";

for my $user ( @{ load_user_group($unknown_file)->{user} || [] } ) {
    my $result = $learner->predict( attributes => $user->{heuristics} );

    print " " . $user->{name} . " - probability of being in the group of: \n";

    # Sort the labels so the report order is stable from run to run.
    for my $label ( sort keys %$result ) {
        print " $label: " . POSIX::floor( 100 * $result->{$label} ) . "%\n";
    }
    print "\n";
}
|---|
| 40 |
|
|---|