#!/usr/bin/perl
#------------------------------------------------------------------------
# spamtable
#
# J.E.Mortis (mortis@ucalgary.ca)
# 2000-11-23
#
# This Perl program pretty-prints the contents of the Mozilla training.dat
# file, which contains the word statistics collected for Bayesian spam
# filtering.
#
# Usage:  spamtable < training.dat
#
# File layout, as consumed below (all integers are unsigned 32-bit values
# in network byte order, which is how Mozilla's nsBayesianFilter writes
# them):
#
#   4 bytes   magic cookie FE ED FA CE
#   uint32    number of good messages
#   uint32    number of bad (junk) messages
#   uint32    number of good tokens, followed by that many records
#   uint32    number of bad tokens,  followed by that many records
#
# Each token record is:  uint32 count, uint32 length, <length> token bytes.
#
# Output is one line per token seen more than $MIN_TOTAL times in total:
# the fraction of its occurrences that were in bad mail ("%6.4f"), a tab,
# and the token.
#
#-------------------------------------------------------------------------
use strict;
use warnings;

# Set to a true value to also dump the header counts and every raw record.
my $debug = 0;

# Cookie expected in the first four bytes of training.dat.
my $MAGIC = "\xFE\xED\xFA\xCE";

# Tokens are reported only when (good + bad) exceeds this many occurrences.
my $MIN_TOTAL = 5;

# Read exactly $length bytes from $fh and return them.  Dies with a message
# naming $what if the stream errors or ends early, so a truncated file is
# reported instead of silently producing garbage counts.
sub read_exact {
    my ($fh, $length, $what) = @_;

    return '' if $length == 0;

    my $buffer = '';
    my $got    = read($fh, $buffer, $length);
    die "spamtable: error reading $what: $!\n" unless defined $got;
    die "spamtable: unexpected end of file while reading $what\n"
        unless $got == $length;

    return $buffer;
}

# Read one big-endian unsigned 32-bit integer from $fh.  The "N" template
# is byte-order independent; the native "I" template decodes the file
# incorrectly on little-endian machines.
sub read_uint32 {
    my ($fh, $what) = @_;

    return unpack('N', read_exact($fh, 4, $what));
}

# Read one token table (a record count followed by the records) from $fh
# and return a reference to a hash mapping token => count.  $label is
# "good" or "bad" and is used only in diagnostics, which go to $out when
# $debug is set.  If a token occurs twice the later record wins.
sub read_token_table {
    my ($fh, $label, $out) = @_;

    my $entries = read_uint32($fh, "$label token count");
    print {$out} ucfirst($label), " token count: $entries\n" if $debug;

    my %count_of;
    for my $index (1 .. $entries) {
        my $count  = read_uint32($fh, "$label token #$index count");
        my $length = read_uint32($fh, "$label token #$index length");
        my $token  = read_exact($fh, $length, "$label token #$index text");

        print {$out} "$label\t$count\t$token\n" if $debug;
        $count_of{$token} = $count;
    }

    return \%count_of;
}

# Print the odds table for the two token hashes to $out.  Tokens known to
# the good table come first, then tokens that occur only in the bad table;
# within each group tokens are sorted so the output is reproducible.
sub print_odds_table {
    my ($out, $good, $bad) = @_;

    for my $token (sort keys %{$good}) {
        my $good_count = $good->{$token};
        my $bad_count  = exists $bad->{$token} ? $bad->{$token} : 0;
        my $total      = $good_count + $bad_count;
        next unless $total > $MIN_TOTAL;

        printf {$out} "%6.4f\t%s\n", $bad_count / $total, $token;
    }

    for my $token (sort keys %{$bad}) {
        # Already reported by the first pass; skipping here also avoids
        # printing a token twice when its stored good count is zero.
        next if exists $good->{$token};
        next unless $bad->{$token} > $MIN_TOTAL;

        print {$out} "1.0000\t$token\n";
    }

    return;
}

# Entry point: parse training.dat from $in (default STDIN) and write the
# odds table to $out (default STDOUT).  Dies on a bad cookie or a
# truncated file.
sub main {
    my ($in, $out) = @_;
    $in  = \*STDIN  unless defined $in;
    $out = \*STDOUT unless defined $out;

    # The input is binary; prevent newline translation or decoding layers.
    binmode($in);

    my $cookie = read_exact($in, 4, 'magic cookie');
    die "spamtable: input does not start with the training.dat cookie\n"
        unless $cookie eq $MAGIC;

    my $good_messages = read_uint32($in, 'good message count');
    my $bad_messages  = read_uint32($in, 'bad message count');
    print {$out} "Good message count: $good_messages\n" if $debug;
    print {$out} "Bad message count: $bad_messages\n"   if $debug;

    my $good = read_token_table($in, 'good', $out);
    my $bad  = read_token_table($in, 'bad',  $out);

    print_odds_table($out, $good, $bad);

    return;
}

# Run only when executed as a script, so the file can also be loaded with
# require/do without consuming STDIN.
main() unless caller;