=head1 NAME

CollectTokens - collects bayes tokens in an SQL database.

=head1 SYNOPSIS

  loadplugin Mail::SpamAssassin::Plugin::CollectTokens /usr/local/etc/mail/spamassassin/plugins/CollectTokens.pm
  collecttokens_sql_dsn      DBI:mysql:dbname:localhost
  collecttokens_sql_username user
  collecttokens_sql_password pass
  collecttokens_sql_delayed  yes|no

=head1 DESCRIPTION

This module collects tokens from bayes in a database indexed by hash value,
making it possible to see what tokens the bayes database contains.

=head2 Top 10 ham tokens

  SELECT bayes_token.ham_count,bayes_rawtoken.rawtoken
    FROM bayes_rawtoken,bayes_token
    WHERE (bayes_rawtoken.token=bayes_token.token)
    ORDER BY bayes_token.ham_count/bayes_token.spam_count DESC LIMIT 10;

  SELECT bayes_token.ham_count,bayes_rawtoken.rawtoken
    FROM bayes_rawtoken,bayes_token
    WHERE bayes_rawtoken.token=bayes_token.token
    ORDER BY bayes_token.ham_count DESC LIMIT 10;

=head2 Top 10 spam tokens

  SELECT bayes_token.spam_count,bayes_rawtoken.rawtoken
    FROM bayes_rawtoken,bayes_token
    WHERE (bayes_rawtoken.token=bayes_token.token)
    ORDER BY bayes_token.spam_count/bayes_token.ham_count DESC LIMIT 10;

  SELECT bayes_token.spam_count,bayes_rawtoken.rawtoken
    FROM bayes_rawtoken,bayes_token
    WHERE bayes_rawtoken.token=bayes_token.token
    ORDER BY bayes_token.spam_count DESC LIMIT 10;

=head2 List by spam probability as calculated by Mail::SpamAssassin::Bayes?

Here would be a good place to put SELECT statements that use the same
algorithm as SpamAssassin for calculating spam probability. Since I'm not
good enough at SQL to do that, I'll leave this up to you.

=head1 REQUIREMENT

The plugin requires a database with a table similar to this:

  CREATE TABLE bayes_rawtoken (
    token BINARY(5) NOT NULL,
    rawtoken VARCHAR(255) NOT NULL DEFAULT '',
    atime INT(11) NOT NULL DEFAULT 0
  );
  CREATE UNIQUE INDEX brtidx ON bayes_rawtoken (token,rawtoken);
  CREATE INDEX brttim ON bayes_rawtoken (atime);

=head1 CONFIGURATION

The configuration is done in SpamAssassin site config.

=over

=item collecttokens_sql_dsn

Which database driver and database to use.

=item collecttokens_sql_username

User name for the database connection.

=item collecttokens_sql_password

Password for the database connection.

=item collecttokens_sql_delayed

Tells the module to use delayed manipulations with MySQL to speed things up.

=item collecttokens_sql_transactions

Optional yes/no switch. When set, it overrides the per-driver default that
decides whether the inserts of one learn operation are wrapped in a
transaction (the default is on for SQLite DSNs and off otherwise).

=back

=head1 FUTURE

It might be a good idea to put in more server specific statements in the
_sql_connect method. It could also be a good idea to use an INSERT with
fallback to an UPDATE of atime rather than the current replace based
variants. A setting that makes this use a tied hash instead of SQL could be
useful for some people. I won't do those things, but I'm willing to test code
for this submitted by others.

=cut

package Mail::SpamAssassin::Plugin::CollectTokens;

# $Id: CollectTokens.pm,v 1.13 2010/01/15 14:21:55 jonas Exp $

use strict;
use warnings;
use base 'Mail::SpamAssassin::Plugin';
use DBI;

use constant ID => 0;

# Plain-function wrapper that forwards its arguments to
# Mail::SpamAssassin::Plugin::dbg.
sub dbg {
    return Mail::SpamAssassin::Plugin::dbg(@_);
}

# Constructor.
#   $class  - class name or an existing instance
#   $mailsa - object handed to the parent constructor; its {conf} hash is
#             reachable afterwards as $self->{main}->{conf}
# Installs the default connection settings in the configuration hash and
# returns the new plugin object.
sub new {
    my ($class, $mailsa) = @_;
    $class = ref($class) || $class;
    my $self = $class->SUPER::new($mailsa);
    bless($self, $class);

    $self->{sqldb} = undef;

    my $conf = $self->{main}->{conf};
    $conf->{collecttokens_sql_dsn}      = 'DBI:mysql:mdf:localhost';
    $conf->{collecttokens_sql_username} = 'sa';
    $conf->{collecttokens_sql_password} = 'pwd';
    $conf->{collecttokens_sql_delayed}  = 0;
    # collecttokens_sql_transactions is deliberately left unset so that
    # _sql_connect falls back to its per-driver default.  (An earlier
    # revision assigned 1 to collecttokens_sql_password a second time here,
    # which destroyed the default password.)

    return $self;
}

# Configuration callback.
#   $pars - hash ref with at least {key}, {value} and {user_config}
# Handles the collecttokens_sql_* settings from site configuration; settings
# coming from user configuration are refused.  Returns 1 when the line was
# consumed, 0 otherwise.
sub parse_config {
    my ($self, $pars) = @_;

    return 0 if ($pars->{user_config});
    return 0 unless ($pars->{key} =~ /^collecttokens_(\S+)$/);
    my $key = $1;
    return 0 unless ($key =~ /^(?:sql_dsn|sql_username|sql_password|sql_delayed|sql_transactions)$/);

    # Build the text for the debug line.  Credentials are never logged, and
    # the value is tested with length() so that a literal "0" still shows up.
    my $val = $pars->{value};
    $val = '' if (!defined($val) || $key =~ /(?:username|password)/);
    $val = " = $val" if (length($val));
    $self->_dbg('config %s%s', $key, $val);

    $self->{main}->{conf}->{$pars->{key}} = $pars->{value};
    $self->inhibit_further_callbacks();
    return 1;
}

# Interpret a yes/no style string (plain function, not a method).
#   $def - value returned when $str is undefined or not recognised
#   $str - text to interpret
# Returns 0 for n/no/off/false or a number made only of zeros, 1 for
# y/yes/on/true or any other run of digits, $def otherwise.
sub _yes_or_no {
    my ($def, $str) = @_;
    return $def unless (defined($str));
    return 0 if ($str =~ /^\s*(?:n|no|off|false|0+)\s*$/i);
    return 1 if ($str =~ /^\s*(?:y|yes|on|true|\d+)\s*$/i);
    return $def;
}

# Emit a debug message prefixed with "collecttokens: ".
#   $fmt, @args - sprintf format and its arguments
sub _dbg {
    my ($self, $fmt, @args) = @_;
    return dbg("collecttokens: " . sprintf($fmt, @args));
}

# Start a transaction when transactions are enabled and the handle is still
# in AutoCommit mode.  Returns true on success or when nothing had to be done.
sub _sql_begin {
    my ($self) = @_;
    return 1 unless ($self->{main}->{conf}->{_collecttokens_sql_trans});
    return 1 unless ($self->{sqldb}->{AutoCommit});
    return $self->{sqldb}->begin_work;
}

# Commit the current transaction; a no-op returning 1 when transactions are
# disabled.
sub _sql_commit {
    my ($self) = @_;
    return 1 unless ($self->{main}->{conf}->{_collecttokens_sql_trans});
    return $self->{sqldb}->commit;
}

# Roll back the current transaction; a no-op returning 1 when transactions
# are disabled.
sub _sql_rollback {
    my ($self) = @_;
    return 1 unless ($self->{main}->{conf}->{_collecttokens_sql_trans});
    return $self->{sqldb}->rollback;
}

# Run each of the given SQL statements with do().
# Stops at the first empty or failing statement, logs it and returns 0;
# returns 1 when every statement succeeded.
sub _sql_command {
    my ($self, @commands) = @_;
    foreach my $cmd (@commands) {
        unless ($cmd && defined($self->{sqldb}->do($cmd))) {
            $self->_dbg('sql cmd %s', defined($cmd) ? $cmd : '');
            return 0;
        }
    }
    return 1;
}

# Thin wrapper around prepare_cached() on the current database handle.
sub _sql_prepare {
    my ($self, @args) = @_;
    return $self->{sqldb}->prepare_cached(@args);
}

# Open the database connection (if not already open) and choose the
# statement used to store tokens.
# Side effects on the configuration hash:
#   _collecttokens_sql_stmt  - INSERT/REPLACE statement with 3 placeholders
#   _collecttokens_sql_trans - whether begin/commit/rollback are active
# Returns 1 on success, 0 when the connection could not be established.
sub _sql_connect {
    my ($self) = @_;
    return 1 if ($self->{sqldb});

    my $conf = $self->{main}->{conf};
    $conf->{_collecttokens_sql_trans}   = 0;
    $conf->{_collecttokens_sql_delayed} = 0;

    $self->{sqldb} = DBI->connect_cached(
        $conf->{collecttokens_sql_dsn},
        $conf->{collecttokens_sql_username},
        $conf->{collecttokens_sql_password},
        { RaiseError => 0 }
    );
    unless ($self->{sqldb}) {
        $self->_dbg('sql connect failed');
        return 0;
    }

    # Fallback statement, replaced below for MySQL DSNs.
    $conf->{_collecttokens_sql_stmt} =
        'INSERT OR REPLACE INTO bayes_rawtoken (token,rawtoken,atime) VALUES (?,?,?)';

    if ($conf->{collecttokens_sql_dsn} =~ /sqlite/i) {
        # Best effort: a failure is logged by _sql_command and otherwise
        # ignored.  (This used to call an undefined sub, sql_command_i, which
        # made every SQLite connection die.)
        $self->_sql_command('PRAGMA SYNCHRONOUS=OFF');
        $conf->{_collecttokens_sql_trans} = 1;
    }
    elsif ($conf->{collecttokens_sql_dsn} =~ /mysql/i) {
        if (_yes_or_no(0, $conf->{collecttokens_sql_delayed})) {
            $conf->{_collecttokens_sql_stmt} =
                'REPLACE DELAYED INTO bayes_rawtoken (token,rawtoken,atime) VALUES (?,?,?)';
        }
        else {
            $conf->{_collecttokens_sql_stmt} =
                'INSERT INTO bayes_rawtoken (token,rawtoken,atime) VALUES (?,?,?) ON DUPLICATE KEY UPDATE atime=VALUES(atime)';
        }
    }

    # An explicit collecttokens_sql_transactions setting overrides the
    # per-driver default chosen above.
    $conf->{_collecttokens_sql_trans} = _yes_or_no(
        $conf->{_collecttokens_sql_trans},
        $conf->{collecttokens_sql_transactions}
    );
    return 1;
}

# Close the database connection, if any, and forget the handle.
sub _sql_disconnect {
    my ($self) = @_;
    if ($self->{sqldb}) {
        $self->{sqldb}->disconnect();
    }
    $self->{sqldb} = undef;
    return;
}

# Quote a string for literal use in SQL through the database handle.
sub _sql_quote {
    my ($self, $s) = @_;
    return $self->{sqldb}->quote($s);
}

# Plugin callback that stores learned tokens.
#   $info - hash ref; {toksref} is a hash ref whose keys are written to the
#           "token" column and whose values go to the "rawtoken" column
# Every token is written with the current time as atime.  Tokens whose insert
# failed in the first pass are retried one by one after the commit.  The
# connection is opened at the start and closed again before returning.
sub bayes_learn {
    my ($self, $info) = @_;

    my $toks = $info->{toksref};
    return unless ($toks && %{$toks});
    return unless ($self->_sql_connect);

    my $sth = $self->_sql_prepare($self->{main}->{conf}->{_collecttokens_sql_stmt});
    if ($sth) {
        if ($self->_sql_begin) {
            my $now    = time();
            my $stored = 0;
            my @failed = ();

            # First pass: everything inside one transaction (if enabled).
            foreach my $token (keys %{$toks}) {
                if ($sth->execute($token, $toks->{$token}, $now)) {
                    $stored++;
                }
                else {
                    push @failed, $token;
                }
            }

            my $ok = $self->_sql_commit;
            if ($ok) {
                # Second pass: retry each failed token on its own.
                foreach my $token (@failed) {
                    $ok = $self->_sql_begin;
                    last unless ($ok);
                    if ($sth->execute($token, $toks->{$token}, $now)) {
                        $ok = $self->_sql_commit;
                        $stored++ if ($ok);
                    }
                    else {
                        $ok = $self->_sql_rollback;
                    }
                    last unless ($ok);
                }
            }

            $self->_dbg('collected %u tokens', $stored) if ($stored);
            $self->_dbg('SQL ERROR!') unless ($ok);
        }
        else {
            $self->_dbg('sql begin failed');
        }
        $sth->finish;
    }
    else {
        $self->_dbg('sql prepare failed');
    }

    $self->_sql_disconnect;
    return;
}

1;