=head1 NAME
CollectTokens - collects bayes tokens in sql database.
=head1 SYNOPSIS
loadplugin Mail::SpamAssassin::Plugin::CollectTokens /usr/local/etc/mail/spamassassin/plugins/CollectTokens.pm
collecttokens_sql_dsn DBI:mysql:dbname:localhost
collecttokens_sql_username user
collecttokens_sql_password pass
collecttokens_sql_delayed yes|no
=head1 DESCRIPTION
This module collects tokens from bayes in a database indexed by hash value,
making it possible to see what tokens the bayes database contains.
=head2 Top 10 ham tokens
SELECT bayes_token.ham_count,bayes_rawtoken.rawtoken
FROM bayes_rawtoken,bayes_token
WHERE (bayes_rawtoken.token=bayes_token.token)
ORDER BY bayes_token.ham_count/bayes_token.spam_count DESC LIMIT 10;
SELECT bayes_token.ham_count,bayes_rawtoken.rawtoken
FROM bayes_rawtoken,bayes_token
WHERE bayes_rawtoken.token=bayes_token.token
ORDER BY bayes_token.ham_count DESC LIMIT 10;
=head2 Top 10 spam tokens
SELECT bayes_token.spam_count,bayes_rawtoken.rawtoken
FROM bayes_rawtoken,bayes_token
WHERE (bayes_rawtoken.token=bayes_token.token)
ORDER BY bayes_token.spam_count/bayes_token.ham_count DESC LIMIT 10;
SELECT bayes_token.spam_count,bayes_rawtoken.rawtoken
FROM bayes_rawtoken,bayes_token
WHERE bayes_rawtoken.token=bayes_token.token
ORDER BY bayes_token.spam_count DESC LIMIT 10;
=head2 List by spam probability as calculated by Mail::SpamAssassin::Bayes?
Here would be a good place to put SELECT statements that use the same
algorithm as SpamAssassin for calculating spam probability.
Since I'm not good enough at SQL to do that, I'll leave this up to you.
=head1 REQUIREMENT
The plugin requires a database with a table similar to this:
CREATE TABLE bayes_rawtoken (
token BINARY(5) NOT NULL,
rawtoken VARCHAR(255) NOT NULL DEFAULT '',
atime INT(11) NOT NULL DEFAULT 0
);
CREATE UNIQUE INDEX brtidx ON bayes_rawtoken (token,rawtoken);
=head1 CONFIGURATION
The configuration is done in SpamAssassin site config.
=over
=item collecttokens_sql_dsn
Which database driver and database to use.
=item collecttokens_sql_username
User name for the database connection.
=item collecttokens_sql_password
Password for the database connection.
=item collecttokens_sql_delayed
Tells the module to use delayed manipulations with MySQL to speed things up.
=back
=head1 FUTURE
It might be a good idea to put in more server specific statements in the
_sql_connect method.
It could also be a good idea to use an INSERT with fallback to
an UPDATE of atime rather than the current replace-based variants.
A setting that makes this use a tied hash instead of SQL could be
useful for some people.
I won't do those things, but I'm willing to test code for this submitted by
others.
=cut
package Mail::SpamAssassin::Plugin::CollectTokens;
# $Id: CollectTokens.pm,v 1.11 2009/06/26 11:52:16 jonas Exp $
use strict;
use base 'Mail::SpamAssassin::Plugin';
use DBI;
use constant ID => 0;
# Package-local shortcut: hands every argument on, unchanged, to
# Mail::SpamAssassin::Plugin::dbg and returns whatever that returns.
sub dbg {
    my @message = @_;
    return Mail::SpamAssassin::Plugin::dbg(@message);
}
# Constructor.
#
# Builds the plugin object through the parent class constructor and seeds
# the configuration hash ($self->{main}->{conf}) with default values for
# the collecttokens_sql_* settings. These defaults are overwritten by
# parse_config() when the matching keys appear in the site configuration.
#
# Parameters:
#   $class  - class name, or an existing object to take the class from
#   $mailsa - passed through unchanged to the parent constructor
#
# Returns the new plugin object.
sub new {
    my ($class, $mailsa) = @_;
    $class = ref($class) || $class;
    my $self = $class->SUPER::new($mailsa);
    bless($self, $class);

    # No database handle yet; _sql_connect() creates it on demand.
    $self->{sqldb} = undef;

    my $conf = $self->{main}->{conf};
    $conf->{collecttokens_sql_dsn}      = 'DBI:mysql:mdf:localhost';
    $conf->{collecttokens_sql_username} = 'sa';
    $conf->{collecttokens_sql_password} = 'pwd';
    $conf->{collecttokens_sql_delayed}  = 0;

    # The default password used to be overwritten with 1 by a stray second
    # assignment to collecttokens_sql_password. That assignment is gone.
    # collecttokens_sql_transactions is intentionally left undefined so
    # that _sql_connect() keeps its per-driver transaction default.
    return $self;
}
# Configuration callback.
#
# Accepts the site-wide settings collecttokens_sql_dsn,
# collecttokens_sql_username, collecttokens_sql_password,
# collecttokens_sql_delayed and collecttokens_sql_transactions, and stores
# the raw value in $self->{main}->{conf} under the full key name.
#
# Parameters:
#   $pars - hash ref; the fields read here are {user_config}, {key} and
#           {value}
#
# Returns 1 when the setting was consumed (further callbacks are then
# inhibited), 0 when it is ignored: user-level configuration, a key without
# the collecttokens_ prefix, or an unknown collecttokens_ key.
sub parse_config {
    my ($self, $pars) = @_;
    return 0 if ($pars->{user_config});
    return 0 unless ($pars->{key} =~ /^collecttokens_(\S+)$/);
    my $key = $1;
    return 0 unless ($key =~ /^(?:sql_dsn|sql_username|sql_password|sql_delayed|sql_transactions)$/);

    # Build the text shown in the debug log. Credentials are never logged.
    my $shown = $pars->{value};
    $shown = '' if (!defined($shown) || $key =~ /(?:username|password)/);
    # Test the length rather than the truth value, so that a setting of
    # "0" (e.g. "collecttokens_sql_delayed 0") still shows up in the log.
    $shown = " = $shown" if (length($shown));
    $self->_dbg('config %s%s', $key, $shown);

    $self->{main}->{conf}->{$pars->{key}} = $pars->{value};
    $self->inhibit_further_callbacks();
    return 1;
}
# Interpret a configuration string as a boolean.
#
# Parameters:
#   $def - value to return when $str is undefined or not recognised
#   $str - text to interpret; surrounding whitespace and case are ignored
#
# Returns 0 for n/no/off/false or a number made only of zeros, 1 for
# y/yes/on/true or any other run of digits, and $def otherwise.
sub _yes_or_no {
    my ($fallback, $text) = @_;

    if (defined($text)) {
        # The "no" pattern has to be tried first: an all-zero number would
        # also satisfy the \d+ alternative of the "yes" pattern.
        if ($text =~ /^\s*(?:n|no|off|false|0+)\s*$/i) {
            return 0;
        }
        if ($text =~ /^\s*(?:y|yes|on|true|\d+)\s*$/i) {
            return 1;
        }
    }
    return $fallback;
}
# Emit a debug message prefixed with "collecttokens: ".
#
# Parameters:
#   $format - sprintf format string
#   @args   - values substituted into $format
sub _dbg {
    my ($self, $format, @args) = @_;
    dbg("collecttokens: " . sprintf($format, @args));
}
# Open a transaction on the current database handle when needed.
#
# Nothing is done (and 1 is returned) when transactions are switched off in
# the configuration, or when the handle is not in AutoCommit mode.
# Otherwise the result of begin_work on the handle is returned.
sub _sql_begin {
    my ($self) = @_;
    my $wants_transaction = $self->{main}->{conf}->{_collecttokens_sql_trans};

    if ($wants_transaction && $self->{sqldb}->{AutoCommit}) {
        return $self->{sqldb}->begin_work;
    }
    return 1;
}
# Commit the current transaction.
#
# Returns the result of commit on the database handle, or 1 without
# touching the handle when transactions are switched off.
sub _sql_commit {
    my ($self) = @_;

    return $self->{main}->{conf}->{_collecttokens_sql_trans}
        ? $self->{sqldb}->commit
        : 1;
}
# Roll back the current transaction.
#
# Returns the result of rollback on the database handle, or 1 without
# touching the handle when transactions are switched off.
sub _sql_rollback {
    my ($self) = @_;

    if ($self->{main}->{conf}->{_collecttokens_sql_trans}) {
        return $self->{sqldb}->rollback;
    }
    return 1;
}
# Run a list of SQL statements, in order, on the current database handle.
#
# Stops at the first statement that is empty/false or for which do()
# returns undef; that statement is written to the debug log and 0 is
# returned. Returns 1 when every statement went through (also when the
# list is empty).
sub _sql_command {
    my ($self, @statements) = @_;

    foreach my $statement (@statements) {
        #$self->_dbg('sql %s',$statement);
        next if ($statement && defined($self->{sqldb}->do($statement)));
        $self->_dbg('sql cmd %s', $statement);
        return 0;
    }
    return 1;
}
# Prepare a statement through prepare_cached on the current database
# handle. All arguments after $self are passed on unchanged; the result of
# prepare_cached is returned.
sub _sql_prepare {
    my ($self, @prepare_args) = @_;
    return $self->{sqldb}->prepare_cached(@prepare_args);
}
# Connect to the configured database and choose the statement that
# bayes_learn() uses to store tokens.
#
# Does nothing when a handle is already present in $self->{sqldb}.
# Otherwise it connects with DBI->connect_cached using the
# collecttokens_sql_dsn / _username / _password settings and fills in these
# derived settings in $self->{main}->{conf}:
#   _collecttokens_sql_stmt    - statement with three placeholders
#                                (token, rawtoken, atime)
#   _collecttokens_sql_trans   - whether the _sql_begin/_sql_commit/
#                                _sql_rollback helpers act; defaults to on
#                                for SQLite DSNs and off otherwise, and can
#                                be overridden with
#                                collecttokens_sql_transactions
#   _collecttokens_sql_delayed - reset to 0 here
#
# Returns 1 on success, 0 when the connection could not be established.
sub _sql_connect {
    my ($self) = @_;
    return 1 if ($self->{sqldb});
    #$self->_dbg('sql connect');

    my $conf = $self->{main}->{conf};
    $conf->{_collecttokens_sql_trans}   = 0;
    $conf->{_collecttokens_sql_delayed} = 0;

    $self->{sqldb} = DBI->connect_cached(
        $conf->{collecttokens_sql_dsn},
        $conf->{collecttokens_sql_username},
        $conf->{collecttokens_sql_password},
        { RaiseError => 0 }
    );
    unless ($self->{sqldb}) {
        $self->_dbg('sql connect failed');
        return 0;
    }

    # Default statement; the MySQL branch below replaces it.
    $conf->{_collecttokens_sql_stmt} = 'INSERT OR REPLACE INTO bayes_rawtoken (token,rawtoken,atime) VALUES (?,?,?)';

    if ($conf->{collecttokens_sql_dsn} =~ /sqlite/i) {
        # This used to call sql_command_i(), which is not defined in this
        # package, so every SQLite connection died here. Use the
        # _sql_command method instead. The pragma is best effort: a failure
        # is logged by _sql_command and does not abort the connection.
        $self->_sql_command('PRAGMA SYNCHRONOUS=OFF');
        $conf->{_collecttokens_sql_trans} = 1;
        #$self->_sql_command('PRAGMA COUNT CHANGES=0');
    }
    elsif ($conf->{collecttokens_sql_dsn} =~ /mysql/i) {
        if (_yes_or_no(0, $conf->{collecttokens_sql_delayed})) {
            $conf->{_collecttokens_sql_stmt} = 'REPLACE DELAYED INTO bayes_rawtoken (token,rawtoken,atime) VALUES (?,?,?)';
        }
        else {
            #$conf->{_collecttokens_sql_stmt} = 'REPLACE INTO bayes_rawtoken (token,rawtoken,atime) VALUES (?,?,?)';
            $conf->{_collecttokens_sql_stmt} = 'INSERT INTO bayes_rawtoken (token,rawtoken,atime) VALUES (?,?,?) ON DUPLICATE KEY UPDATE atime=VALUES(atime)';
        }
    }

    # An explicit collecttokens_sql_transactions setting overrides the
    # per-driver default chosen above.
    $conf->{_collecttokens_sql_trans} = _yes_or_no(
        $conf->{_collecttokens_sql_trans},
        $conf->{collecttokens_sql_transactions}
    );
    return 1;
}
# Close the database connection, if there is one, and forget the handle so
# that the next _sql_connect() starts over.
sub _sql_disconnect {
    my $self = shift;
    my $handle = $self->{sqldb};

    #$self->_dbg('sql disconnect');
    $handle->disconnect() if ($handle);
    $self->{sqldb} = undef;
}
# Quote a single value with the quote method of the current database
# handle and return the result.
sub _sql_quote {
    my $self = shift;
    my ($raw) = @_;
    return $self->{sqldb}->quote($raw);
}
# Plugin callback: store the tokens of a learned message.
#
# Parameters:
#   $info - hash ref; only {toksref} is read. It is expected to be a hash
#           ref: each key is bound to the "token" column and its value to
#           the "rawtoken" column of the configured statement, together
#           with the current time as "atime".
#
# Does nothing when there are no tokens or the database connection fails.
# All rows are written inside one transaction (when transactions are
# enabled); the first failing row aborts the run and triggers a rollback.
# The connection is closed again before returning. Returns nothing useful.
sub bayes_learn {
    my ($self, $info) = @_;

    my $tokens = $info->{toksref};
    return unless ($tokens && %{$tokens});
    return unless ($self->_sql_connect);

    my $sth = $self->_sql_prepare($self->{main}->{conf}->{_collecttokens_sql_stmt});
    if (!$sth) {
        # Previously skipped without any trace in the plugin debug log.
        $self->_dbg('sql prepare failed');
    }
    elsif (!$self->_sql_begin) {
        # Previously skipped without any trace in the plugin debug log.
        $self->_dbg('sql begin failed');
        $sth->finish;
    }
    else {
        my $now = time();
        my $ok = 1;
        my $stored = 0;
        foreach my $token (keys %{$tokens}) {
            $ok = $sth->execute($token, $tokens->{$token}, $now);
            last unless ($ok);
            $stored++;
        }

        if (!$ok) {
            $self->_dbg('SQL ERROR!');
            $self->_sql_rollback;
        }
        elsif ($self->_sql_commit) {
            # Report success only once the commit has gone through; the
            # count used to be logged before (and regardless of) the commit.
            $self->_dbg('collected %u tokens', $stored);
        }
        else {
            $self->_dbg('sql commit failed');
        }
        $sth->finish;
    }

    $self->_sql_disconnect;
    return;
}
1;
(2008-01-11)