Whatever

sa: CollectTokens.pm

=head1 NAME

CollectTokens - collects bayes tokens in sql database.

=head1 SYNOPSIS

	loadplugin Mail::SpamAssassin::Plugin::CollectTokens /usr/local/etc/mail/spamassassin/plugins/CollectTokens.pm
  
	collecttokens_sql_dsn       DBI:mysql:dbname:localhost
	collecttokens_sql_username  user
	collecttokens_sql_password  pass
	collecttokens_sql_delayed   yes|no

=head1 DESCRIPTION

This module collects tokens from bayes in a database indexed by hash value,
making it possible to see what tokens the bayes database contains.

=head2 Top 10 ham tokens

	SELECT bayes_token.ham_count,bayes_rawtoken.rawtoken 
	  FROM bayes_rawtoken,bayes_token 
	  WHERE (bayes_rawtoken.token=bayes_token.token)
	  ORDER BY bayes_token.ham_count/bayes_token.spam_count DESC LIMIT 10;

	SELECT bayes_token.ham_count,bayes_rawtoken.rawtoken 
	  FROM bayes_rawtoken,bayes_token 
	  WHERE bayes_rawtoken.token=bayes_token.token
	  ORDER BY bayes_token.ham_count DESC LIMIT 10;

=head2 Top 10 spam tokens

	SELECT bayes_token.spam_count,bayes_rawtoken.rawtoken 
	  FROM bayes_rawtoken,bayes_token 
	  WHERE (bayes_rawtoken.token=bayes_token.token)
	  ORDER BY bayes_token.spam_count/bayes_token.ham_count DESC LIMIT 10;

	SELECT bayes_token.spam_count,bayes_rawtoken.rawtoken 
	  FROM bayes_rawtoken,bayes_token 
	  WHERE bayes_rawtoken.token=bayes_token.token
	  ORDER BY bayes_token.spam_count DESC LIMIT 10;

=head2 List by spam probability as calculated by Mail::SpamAssassin::Bayes?

Here would be a good place to put SELECT statements that use the same
algorithm as SpamAssassin for calculating spam probability.

Since I'm not good enough at SQL to do that, I'll leave this up to you.

=head1 REQUIREMENT

The plugin requires a database with a table similar to this:

	CREATE TABLE bayes_rawtoken (
	  token BINARY(5) NOT NULL,
	  rawtoken VARCHAR(255) NOT NULL DEFAULT '',
	  atime INT(11) NOT NULL DEFAULT 0
	);
	CREATE UNIQUE INDEX brtidx ON bayes_rawtoken (token,rawtoken);

=head1 CONFIGURATION

The configuration is done in SpamAssassin site config.

=over

=item collecttokens_sql_dsn

Which database driver and database to use.

=item collecttokens_sql_username

User name for the database connection.

=item collecttokens_sql_password

Password for the database connection.

=item collecttokens_sql_delayed

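Tells the module to use delayed manipulations with MySQL to speed things up.

=item collecttokens_sql_transactions

Tells the module whether to wrap the inserts of each learn operation in a
transaction (yes|no). When not set, transactions are used for SQLite and
not used for other databases.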

=back

=head1 FUTURE

It might be a good idea to put in more server specific statements in the
_sql_connect method.

It could also be a good idea to use an INSERT with a fallback to
an UPDATE of atime rather than the current REPLACE-based variants.

A setting that makes this use a tied hash instead of SQL could be
useful for some people.

I won't do those things, but I'm willing to test code for this submitted by
others.

=cut

package Mail::SpamAssassin::Plugin::CollectTokens;

# $Id: CollectTokens.pm,v 1.11 2009/06/26 11:52:16 jonas Exp $

use strict;
use base 'Mail::SpamAssassin::Plugin';
use DBI;

use constant ID => 0;

# Module-local shortcut that hands its arguments on to the dbg() routine of
# the Mail::SpamAssassin::Plugin package and returns whatever that returns.
sub dbg {
	my @args = @_;
	return Mail::SpamAssassin::Plugin::dbg(@args);
}

# Constructor.
#
# Builds the object through the base class constructor and seeds the shared
# configuration hash with default connection settings.
#
# Arguments:
#   $class  - class name, or an existing object whose class is reused
#   $mailsa - object passed through unchanged to the base class constructor
#
# Returns the new plugin object.
sub new {
	my ($class,$mailsa) = @_;
	$class = ref($class) || $class;
	my $self = $class->SUPER::new($mailsa);
	bless($self,$class);
	# No database handle exists until _sql_connect creates one.
	$self->{sqldb} = undef;
	# Defaults; parse_config overwrites them when the matching
	# collecttokens_* directive is present in the configuration.
	my $conf = $self->{main}->{conf};
	$conf->{collecttokens_sql_dsn} = 'DBI:mysql:mdf:localhost';
	$conf->{collecttokens_sql_username} = 'sa';
	# Assigned exactly once: a duplicate assignment of 1 to this same key
	# used to clobber the string default.
	$conf->{collecttokens_sql_password} = 'pwd';
	$conf->{collecttokens_sql_delayed} = 0;
	# collecttokens_sql_transactions is intentionally left unset so that
	# _sql_connect falls back to its driver-specific default.
	return $self;
}

# Configuration hook: picks up the collecttokens_* directives.
#
# Arguments:
#   $pars - hash ref; the fields read here are user_config, key and value
#
# Returns 1 when the directive was recognised and stored in the shared
# configuration hash (further callbacks are then inhibited), 0 otherwise.
# Directives coming from user configuration are never accepted.
sub parse_config {
	my ($self,$pars) = @_;
	if ($pars->{user_config}) {
		return 0;
	}
	my ($key) = ($pars->{key} =~ /^collecttokens_(\S+)$/);
	unless (defined($key)) {
		return 0;
	}
	unless ($key =~ /^(?:sql_dsn|sql_username|sql_password|sql_delayed|sql_transactions)$/) {
		return 0;
	}
	# Text appended to the debug line; credentials are never echoed.
	my $shown = ($key =~ /(username|password)/) ? '' : $pars->{value};
	if ($shown) {
		$shown = " = $shown";
	}
	$self->_dbg('config %s%s',$key,$shown);
	$self->{main}->{conf}->{$pars->{key}} = $pars->{value};
	$self->inhibit_further_callbacks();
	return 1;
}

# Interprets a boolean-like configuration string.
#
# Arguments:
#   $def - value returned when $str is undefined or not recognised
#   $str - the string to interpret (case-insensitive, surrounding
#          whitespace ignored)
#
# Returns 0 for n/no/off/false or an all-zero digit string, 1 for
# y/yes/on/true or any other digit string, and $def for everything else.
sub _yes_or_no {
	my ($def,$str) = @_;
	if (defined($str)) {
		# The "no" pattern is tried first so that "0", "00", ... never
		# reach the generic digit alternative of the "yes" pattern.
		if ($str =~ /^\s*(?:n|no|off|false|0+)\s*$/i) {
			return 0;
		}
		if ($str =~ /^\s*(?:y|yes|on|true|\d+)\s*$/i) {
			return 1;
		}
	}
	return $def;
}

# Emits a debug line prefixed with "collecttokens: ".
#
# Arguments (after the invocant):
#   $fmt  - sprintf format string
#   @args - values substituted into the format
sub _dbg {
	my ($self,$fmt,@args) = @_;
	dbg("collecttokens: ".sprintf($fmt,@args));
}

# Opens a transaction on the current handle when transactions are enabled
# and the handle is still in AutoCommit mode.
#
# Returns 1 when nothing had to be done, otherwise the result of
# begin_work on the handle.
sub _sql_begin {
	my ($self) = @_;
	my $use_trans = $self->{main}->{conf}->{_collecttokens_sql_trans};
	if ($use_trans && $self->{sqldb}->{AutoCommit}) {
		return $self->{sqldb}->begin_work;
	}
	return 1;
}
# Commits on the current handle when transactions are enabled.
#
# Returns 1 when transactions are disabled, otherwise the result of
# commit on the handle.
sub _sql_commit {
	my ($self) = @_;
	if ($self->{main}->{conf}->{_collecttokens_sql_trans}) {
		return $self->{sqldb}->commit;
	}
	return 1;
}
# Rolls back on the current handle when transactions are enabled.
#
# Returns 1 when transactions are disabled, otherwise the result of
# rollback on the handle.
sub _sql_rollback {
	my ($self) = @_;
	return $self->{main}->{conf}->{_collecttokens_sql_trans}
		? $self->{sqldb}->rollback
		: 1;
}

# Runs the given SQL statements, in order, on the current handle.
#
# Arguments (after the invocant): a list of SQL strings.
#
# Returns 1 when every statement ran. On the first statement that is
# empty/false or for which do() returns undef, the statement is written
# to the debug log, the remaining ones are skipped and 0 is returned.
sub _sql_command {
	my ($self,@cmds) = @_;
	foreach my $cmd (@cmds) {
		next if ($cmd && defined($self->{sqldb}->do($cmd)));
		$self->_dbg('sql cmd %s',$cmd);
		return 0;
	}
	return 1;
}
# Passes its arguments to prepare_cached on the current handle and
# returns the result.
sub _sql_prepare {
	my ($self,@args) = @_;
	return $self->{sqldb}->prepare_cached(@args);
}

# Connects to the configured database (unless a handle already exists) and
# selects the statement and transaction mode used for storing tokens.
#
# Reads collecttokens_sql_dsn/_username/_password/_delayed/_transactions
# from the configuration hash and writes the derived settings
# _collecttokens_sql_stmt and _collecttokens_sql_trans back into it.
#
# Returns 1 on success (or when already connected), 0 when the connection
# could not be established.
sub _sql_connect {
	my ($self) = @_;
	return 1 if ($self->{sqldb});
	#$self->_dbg('sql connect');
	my $conf = $self->{main}->{conf};
	$conf->{_collecttokens_sql_trans} = 0;
	# Reset here but not read anywhere else in this module.
	$conf->{_collecttokens_sql_delayed} = 0;
	# RaiseError is off: failures are reported through return values.
	$self->{sqldb} = DBI->connect_cached(
				$conf->{collecttokens_sql_dsn},
				$conf->{collecttokens_sql_username},
				$conf->{collecttokens_sql_password},
				{RaiseError=>0}
	);
	unless ($self->{sqldb}) {
		$self->_dbg('sql connect failed');
		return 0;
	}
	# Default statement, used for every DSN that is not MySQL.
	$conf->{_collecttokens_sql_stmt} = 'INSERT OR REPLACE INTO bayes_rawtoken (token,rawtoken,atime) VALUES (?,?,?)';
	if ($conf->{collecttokens_sql_dsn} =~ /sqlite/i) {
		# Best effort: _sql_command logs a failing statement itself. This
		# used to call an undefined function, which made every learn
		# operation on an SQLite DSN die.
		$self->_sql_command('PRAGMA SYNCHRONOUS=OFF');
		# Transactions default to on for SQLite.
		$conf->{_collecttokens_sql_trans} = 1;
		#$self->_sql_command('PRAGMA COUNT CHANGES=0');
	} elsif ($conf->{collecttokens_sql_dsn} =~ /mysql/i) {
		if (_yes_or_no(0,$conf->{collecttokens_sql_delayed})) {
			$conf->{_collecttokens_sql_stmt} = 'REPLACE DELAYED INTO bayes_rawtoken (token,rawtoken,atime) VALUES (?,?,?)';
		} else {
			#$conf->{_collecttokens_sql_stmt} = 'REPLACE INTO bayes_rawtoken (token,rawtoken,atime) VALUES (?,?,?)';
			$conf->{_collecttokens_sql_stmt} = 'INSERT INTO bayes_rawtoken (token,rawtoken,atime) VALUES (?,?,?) ON DUPLICATE KEY UPDATE atime=VALUES(atime)';
		}
	}
	# An explicit collecttokens_sql_transactions setting overrides the
	# driver-specific default chosen above.
	$conf->{_collecttokens_sql_trans} = _yes_or_no($conf->{_collecttokens_sql_trans},$conf->{collecttokens_sql_transactions});
	return 1;
}

# Disconnects the current handle, if there is one, and clears the slot
# that holds it.
sub _sql_disconnect {
	my $self = shift;
	my $dbh = $self->{sqldb};
	#$self->_dbg('sql disconnect');
	$dbh->disconnect() if ($dbh);
	$self->{sqldb} = undef;
}

# Returns $s quoted by the quote() method of the current handle.
sub _sql_quote {
	my $self = shift;
	my ($s) = @_;
	return $self->{sqldb}->quote($s);
}


# Stores the tokens of a learn operation in the bayes_rawtoken table.
#
# Arguments:
#   $info - hash ref; only $info->{toksref} is used. It must be a hash ref
#           whose keys are bound to the "token" column and whose values are
#           bound to the "rawtoken" column (presumably token hash => raw
#           token text, as handed over by SpamAssassin -- verify against
#           the plugin API documentation).
#
# Nothing is done when there are no tokens or the database connection
# fails. All rows get the same atime. With transactions enabled the rows
# are committed together, or rolled back on the first failing insert.
# Every failure is reported through the debug log only; nothing is thrown
# by this method itself.
sub bayes_learn {
	my ($self,$info) = @_;
	my $toks = $info->{toksref};
	return unless ($toks && %{$toks});
	return unless ($self->_sql_connect);
	my $sth = $self->_sql_prepare($self->{main}->{conf}->{_collecttokens_sql_stmt});
	if ($sth) {
		if ($self->_sql_begin) {
			my $now = time();
			my $ok = 1;
			my $cc = 0;
			foreach my $token (keys %{$toks}) {
				$ok = $sth->execute($token,$toks->{$token},$now);
				last unless ($ok);
				$cc ++;
			}
			if ($ok) {
				$self->_dbg('collected %u tokens',$cc);
				$self->_sql_commit;
			} else {
				# Include the driver's error text so the log says why.
				my $err = $sth->errstr;
				$self->_dbg('SQL ERROR! %s',defined($err) ? $err : 'unknown error');
				$self->_sql_rollback;
			}
		} else {
			# Previously silent: the tokens were dropped without a trace.
			$self->_dbg('sql begin failed');
		}
		$sth->finish;
	} else {
		# Previously silent as well.
		$self->_dbg('sql prepare failed');
	}
	# The connection is only kept for the duration of one learn operation.
	$self->_sql_disconnect;
}

1;

(2008-01-11)