=head1 NAME
HashCount - counts messages using nilsimsa and IxHash/NixSpam like hashes.
=head1 SYNOPSIS
loadplugin Mail::SpamAssassin::Plugin::HashCount /usr/local/etc/mail/spamassassin/plugins/HashCount.pm
hashcount_sql_dsn DBI:mysql:dbname:localhost
hashcount_sql_username user
hashcount_sql_password pass
hashcount_add after,spam
hashcount_stream mail
describe HASHCOUNT_20 More than 20 copies of message
header HASHCOUNT_20 eval:hashcount_check(20)
score HASHCOUNT_20 0.01
header HASHCOUNT_SPAM_5 eval:hashcount_check_spam(5)
score HASHCOUNT_SPAM_5 1.0
header HASHCOUNT_TRAP eval:hashcount_check_trap()
score HASHCOUNT_TRAP 1.0
header HASHCOUNT_FLAG eval:hashcount_check_flag(0)
score HASHCOUNT_FLAG 2.0
=head1 DESCRIPTION
This module counts messages using nilsimsa signatures and hashing similar to
that used by IxHash and NixSpam, and provides eval rules that can check this
count.
=head1 REQUIREMENT
This plugin is tested with MySQL only. It uses a table like this:
CREATE TABLE hashcount (
stamp INTEGER UNSIGNED NOT NULL DEFAULT 0,
type CHAR(4),
what CHAR(1),
hash VARCHAR(74),
help SMALLINT
);
CREATE INDEX idx_hash ON hashcount (type,what,hash(33));
CREATE INDEX idx_help ON hashcount (type,what,help);
=head1 CONFIGURATION
=head2 Eval tests
=over
=item hashcount_check(limit,type,age)
True if a message identified by the hashing has been seen more than limit
times.
Type can be set to:
=over
=item *
fuzzy: only fuzzy matches are checked.
=item *
crisp: only non fuzzy matches are checked.
=item *
all: all matches are checked. This is the default.
=back
If age is specified, only checks for matches no older than age minutes.
NOTE: If hashcount_add is set to add messages before the testing, all
messages will normally be seen at least once before this check is
done.
Limit defaults to 100.
=item hashcount_check_spam(limit,type,age)
As hashcount_check(), but checks if a message identified by the hashing has
been considered as spam. Limit defaults to 10.
=item hashcount_check_trap(limit,type,age)
As hashcount_check(), but checks if a message identified by the hashing has
been seen by spam trap. Limit defaults to 0.
=item hashcount_check_flag(limit,type,age)
As hashcount_check(), but checks if a message identified by the hashing has
been manually flagged as spam. Limit defaults to 0.
=back
=head2 Options
=over
=item hashcount_sql_dsn
Which database driver and database to use.
=item hashcount_sql_username
User name for the database connection.
=item hashcount_sql_password
Password for the database connection.
=item hashcount_add
If and when to add messages to the database.
=over
=item *
Include "before" or "start" to count messages before testing.
=item *
Include "after" or "end", to count messages after testing.
=item *
Include "spam" to count messages considered spam.
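=back
=item hashcount_stream
Name of the counter (stored in the "type" column) that ordinary messages are
added to and that hashcount_check() counts. Defaults to "mail".
=item hashcount_expire
Maximum age, in days, of database entries that the eval tests take into
account. If unset, only the age argument of the eval test limits the search.
=back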
=head1 NOTES
For the eval tests hashcount_check_trap and hashcount_check_flag to be
effective, some other software is needed.
We use the mail reporter and MIMEDefang filter at
http://whatever.frukt.org/mimedefangfilter.text.shtml
=head1 SEE ALSO
=over
=item IxHash
http://wiki.apache.org/spamassassin/iXhash
=item NixSpam
http://www.heise.de/ix/nixspam/
=back
=cut
package Mail::SpamAssassin::Plugin::HashCount;
# $Id: HashCount.pm,v 1.13 2009/06/26 11:52:16 jonas Exp $
use strict;
use base 'Mail::SpamAssassin::Plugin';
use DBI;
use Digest::MD5;
use Digest::SHA;
use Digest::Nilsimsa;
# Write a debug line via Mail::SpamAssassin::Plugin::dbg.
# The first argument is a sprintf format string; it is prefixed with
# "hashcount: " and the remaining arguments fill in its conversions.
sub dbg {
    my ($format, @args) = @_;
    my $line = sprintf("hashcount: $format", @args);
    Mail::SpamAssassin::Plugin::dbg($line);
}
# Constructor, called by SpamAssassin when the plugin is loaded.
#   $class  - class name or an existing object to clone the class from
#   $mailsa - the Mail::SpamAssassin instance handed to the base constructor
# Installs default database settings in the main configuration, creates the
# Nilsimsa digester and registers the four eval rules.
# Returns the new plugin object.
sub new {
    my ($class, $mailsa) = @_;
    $class = ref($class) || $class;
    my $self = $class->SUPER::new($mailsa);
    bless($self, $class);

    # Database handle is opened lazily by _sql_connect().
    $self->{sqldb} = undef;
    # Direct method call instead of the ambiguous indirect object syntax
    # ("new Digest::Nilsimsa").
    $self->{nilsimsa} = Digest::Nilsimsa->new;

    # Defaults; parse_config() overwrites them when the options are set.
    my $conf = $self->{main}->{conf};
    $conf->{hashcount_sql_dsn}      = 'DBI:mysql:mdf:localhost';
    $conf->{hashcount_sql_username} = 'sa';
    $conf->{hashcount_sql_password} = 'pwd';

    foreach my $rule (qw(hashcount_check hashcount_check_spam
                         hashcount_check_trap hashcount_check_flag)) {
        $self->register_eval_rule($rule);
    }
    dbg('registered');
    return $self;
}
# Configuration callback: stores the hashcount_* options in the main
# configuration.
#   $pars - hash ref with at least {key}, {value} and {user_config}
# Returns 0 when the line is not handled here (user configuration or an
# unknown key), 1 after the option has been stored.
sub parse_config {
    my ($self, $pars) = @_;
    return 0 if ($pars->{user_config});
    return 0 unless (defined($pars->{key}) && $pars->{key} =~ /^hashcount_(sql_dsn|sql_username|sql_password|expire|add|stream)$/);
    my $key = $1;

    # Build the debug text. Credentials are never logged; every other value
    # is shown, including a literal "0", which a plain truth test would hide.
    my $shown = $pars->{value};
    $shown = '' if (!defined($shown) || $key =~ /(username|password)/);
    $shown = " = $shown" if (length($shown));
    dbg('config %s%s', $key, $shown);

    $self->{main}->{conf}->{$pars->{key}} = $pars->{value};
    $self->inhibit_further_callbacks();
    return 1;
}
# Make sure $self->{sqldb} holds a database handle, opening one with the
# configured DSN and credentials if necessary.
# Returns 1 when a handle is available, 0 when connecting failed.
sub _sql_connect {
    my ($self) = @_;
    unless ($self->{sqldb}) {
        my $conf = $self->{main}->{conf};
        $self->{sqldb} = DBI->connect_cached(
            $conf->{hashcount_sql_dsn},
            $conf->{hashcount_sql_username},
            $conf->{hashcount_sql_password},
            { RaiseError => 0 },
        );
        unless ($self->{sqldb}) {
            dbg('sql connect failed');
            return 0;
        }
    }
    return 1;
}
# Disconnect the database handle, if one is open, and forget it.
sub _sql_disconnect {
    my ($self) = @_;
    my $dbh = $self->{sqldb};
    $dbh->disconnect() if ($dbh);
    $self->{sqldb} = undef;
}
# Count exact-match ('H') hashes of the current message in the database.
#   $pms     - per-message status; hashes are read from {hashcounthashes}
#   $limit   - threshold the per-hash row count has to exceed
#   $stamp   - absolute epoch cutoff as computed by _count_hashes();
#              only rows with a newer stamp are counted, 0 means no cutoff
#   $counter - value of the "type" column to look at (e.g. 'mail', 'spam')
#   $prefix  - optional regex fragment the hash must start with
# Expects $self->{sqldb} to be connected.
# Returns 1 as soon as one hash has been seen more than $limit times,
# otherwise 0.
sub _count_hashes_hash {
    my ($self,$pms,$limit,$stamp,$counter,$prefix) = @_;
    # The caller has already turned the age (and hashcount_expire) into an
    # absolute cutoff. Converting it again here would turn it back into a
    # small relative number and make the "stamp>?" filter match every row.
    $stamp ||= 0;
    my $st;
    foreach my $hash (@{$pms->{hashcounthashes}}) {
        next unless ($hash && $hash->{t} eq 'H');
        next if ($prefix && $hash->{h} !~ /^$prefix/);
        unless ($st) {
            # Prepared lazily so nothing is sent when no hash qualifies.
            my $cmd = 'SELECT count(stamp) FROM hashcount WHERE type=? AND what=? AND hash=? AND stamp>?';
            dbg('Q %s',$cmd);
            $st = $self->{sqldb}->prepare_cached($cmd);
            unless ($st) {
                dbg('sql prepare failed');
                return 0;
            }
        }
        dbg('E %s H %s %u',$counter,$hash->{h},$stamp);
        unless ($st->execute($counter,'H',$hash->{h},$stamp)) {
            # Do not fetch from a statement that failed to execute.
            dbg('sql execute failed');
            next;
        }
        my @res = $st->fetchrow_array;
        $st->finish;
        my $cnt = (@res && $res[0]) ? $res[0] : 0;
        dbg('count: %u',$cnt);
        return 1 if ($cnt > $limit);
    }
    return 0;
}
# Number of differing bits between two signatures.
#   $val1 - first signature as a packed byte string
#   $sig2 - second signature as a hex string
# Returns the count of set bits in the XOR of both (at most 256 bits are
# examined).
sub _calc_nilsimsa {
    my ($self,$val1,$sig2) = @_;
    my $val2 = pack('H*',$sig2);
    my $diff = $val1 ^ $val2;
    my $bits = unpack('b256',$diff);
    return ($bits =~ tr/1//);
}
# Count fuzzy matches of the message's nilsimsa ('N') signatures.
#   $pms     - per-message status; hashes are read from {hashcounthashes}
#   $limit   - threshold the accumulated count has to exceed
#   $stamp   - absolute epoch cutoff as computed by _count_hashes();
#              only rows with a newer stamp are counted, 0 means no cutoff
#   $counter - value of the "type" column to look at (e.g. 'mail', 'spam')
#   $prefix  - type selector; nilsimsa is skipped unless it is empty or
#              contains '~'
# Candidate rows are pre-selected via the "help" column (within +/-10 of the
# signature's {b} value) and then compared bit by bit; rows differing in
# fewer than 10 bits count as matches.
# Returns 1 as soon as the accumulated count exceeds $limit, otherwise 0.
sub _count_hashes_nilsimsa {
    my ($self,$pms,$limit,$stamp,$counter,$prefix) = @_;
    return 0 if ($prefix && $prefix !~ /~/i);
    return 0 unless ($self->_sql_connect());
    # The caller has already turned the age (and hashcount_expire) into an
    # absolute cutoff. Converting it again here would turn it back into a
    # small relative number and make the "stamp>?" filter match every row.
    $stamp ||= 0;
    my $st;
    foreach my $hash (@{$pms->{hashcounthashes}}) {
        next unless ($hash && $hash->{t} eq 'N');
        unless ($st) {
            # Prepared lazily so nothing is sent when no signature exists.
            my $cmd = 'SELECT count(stamp),hash FROM hashcount WHERE type=? AND what=? AND (help IS NOT NULL AND help>? AND help<?) AND stamp>? GROUP BY hash';
            dbg('Q %s',$cmd);
            $st = $self->{sqldb}->prepare_cached($cmd);
            unless ($st) {
                dbg('sql prepare failed');
                return 0;
            }
        }
        dbg('E %s N %u %u',$counter,$hash->{b},$stamp);
        unless ($st->execute($counter,'N',$hash->{b}-10,$hash->{b}+10,$stamp)) {
            # Do not fetch from a statement that failed to execute.
            dbg('sql execute failed');
            next;
        }
        my $cnt = 0;
        while (my $res = $st->fetchrow_arrayref) {
            next unless (@{$res} && $res->[1] && $res->[0]>0);
            next unless ($self->_calc_nilsimsa($hash->{v},$res->[1]) < 10);
            dbg('C: %u %s',$res->[0],$res->[1]);
            $cnt += $res->[0];
            next unless ($cnt > $limit);
            dbg('count: %u',$cnt);
            $st->finish;
            return 1;
        }
        $st->finish;
        dbg('count: %u',$cnt);
    }
    return 0;
}
# Check whether the message has been counted more than $limit times.
#   $pms     - per-message status carrying {hashcounthashes}
#   $limit   - threshold handed on to the counting helpers
#   $stamp   - maximum age in seconds (false = unlimited)
#   $counter - value of the "type" column to look at (e.g. 'mail', 'spam')
#   $prefix  - 'fuzzy', 'crisp', 'all' or empty
# Returns 1 when either the exact-hash or the nilsimsa lookup reports a
# match above the limit, 0 otherwise (also when the database is unreachable).
sub _count_hashes {
    my ($self,$pms,$limit,$stamp,$counter,$prefix) = @_;

    # Map the symbolic type onto the pattern matching the hash's first
    # character; any other non-empty value is passed through unchanged.
    if (!$prefix) {
        $prefix = '';
    }
    elsif ($prefix =~ /fuzzy/i) {
        $prefix = '~';
    }
    elsif ($prefix =~ /crisp/i) {
        $prefix = '=';
    }
    elsif ($prefix =~ /all/i) {
        $prefix = '[~=]';
    }

    # NOTE(review): string eval of the age value; callers in this file pass
    # a plain number.
    $stamp = eval($stamp) if ($stamp);
    my $expire_days = $self->{main}->{conf}->{hashcount_expire};
    if ($expire_days) {
        # hashcount_expire (days) caps how far back we look.
        my $max_age = $expire_days*24*60*60;
        $stamp = $max_age if (!$stamp || !($stamp<$max_age));
    }
    # Turn the age into an absolute epoch cutoff; 0 means no cutoff.
    $stamp = $stamp ? time()-$stamp : 0;

    return 0 unless ($self->_sql_connect());
    # The nilsimsa lookup only runs when the exact-hash lookup found nothing.
    my $found = ($self->_count_hashes_hash($pms,$limit,$stamp,$counter,$prefix)
              || $self->_count_hashes_nilsimsa($pms,$limit,$stamp,$counter,$prefix)) ? 1 : 0;
    $self->_sql_disconnect();
    return $found;
}
# Store the message's hashes in the database.
#   $pms   - per-message status carrying {hashcounthashes}
#   $acnt  - true: add to the ordinary stream (hashcount_stream or 'mail')
#   $acnts - true: add to the 'spam' stream
# Returns -1 when there is nothing to add, 0 when connecting or preparing
# fails, otherwise the success flag of the insert run (the result of
# commit() when a transaction was open).
sub _add_hashes {
    my ($self,$pms,$acnt,$acnts) = @_;

    my @streams;
    if ($acnt) {
        my $stream = $self->{main}->{conf}->{hashcount_stream};
        push @streams, (defined($stream) ? $stream : 'mail');
    }
    push @streams, 'spam' if ($acnts);
    return -1 unless (@streams);
    return -1 unless (@{$pms->{hashcounthashes}});
    return 0 unless ($self->_sql_connect());

    # One statement for plain hashes, one for nilsimsa rows with "help".
    my $ins_hash = $self->{sqldb}->prepare_cached('INSERT INTO hashcount (stamp,type,what,hash) VALUES (?,?,?,?)');
    unless ($ins_hash) {
        dbg('sql prepare failed');
        $self->_sql_disconnect();
        return 0;
    }
    my $ins_nils = $self->{sqldb}->prepare_cached('INSERT INTO hashcount (stamp,type,what,hash,help) VALUES (?,?,?,?,?)');
    unless ($ins_nils) {
        dbg('sql prepare failed');
        $ins_hash->finish;
        $self->_sql_disconnect();
        return 0;
    }

    my $ok = 1;
    my $now = time();
    $self->{sqldb}->begin_work if ($self->{sqldb}->{AutoCommit});

    STREAM:
    foreach my $stream (@streams) {
        foreach my $hash (@{$pms->{hashcounthashes}}) {
            next unless ($hash);
            next if ($hash->{h} =~ /^#/);
            my $inserted = 0;
            if ($hash->{t} eq 'H') {
                dbg('add: %s H %s',$stream,$hash->{h});
                $inserted = $ins_hash->execute($now,$stream,'H',$hash->{h});
            }
            elsif ($hash->{t} eq 'N') {
                dbg('add: %s N %u %s',$stream,$hash->{b},$hash->{h});
                $inserted = $ins_nils->execute($now,$stream,'N',$hash->{h},$hash->{b});
            }
            next if ($inserted);
            # A failed insert or an unknown hash type aborts the whole run.
            dbg('sql insert failed');
            $ok = 0;
            last STREAM;
        }
    }

    if ($ok) {
        $ok = $self->{sqldb}->commit unless ($self->{sqldb}->{AutoCommit});
    }
    else {
        $self->{sqldb}->rollback unless ($self->{sqldb}->{AutoCommit});
    }
    $ins_nils->finish;
    $ins_hash->finish;
    $self->_sql_disconnect();
    return $ok;
}
# Exact ("crisp") hash over the trimmed body plus the Subject and Content-*
# header lines.
#   $body   - message body as one string
#   $header - array ref of header lines
# Returns {t=>'H', h=>'=<md5 hex>+<sha1 hex>'}.
sub _make_hash0 {
    my ($self,$body,$header) = @_;
    # Strip leading and trailing whitespace from the body.
    for ($body) {
        s/^[\s\n]+//s;
        s/[\s\n]+$//s;
    }
    my $hdr = join('', grep { /^(?:Content-\S+|Subject):/i } @{$header});
    my $text = "$hdr\n$body";
    my $md5 = Digest::MD5::md5_hex($text);
    my $sha1 = Digest::SHA::sha1_hex($text);
    my $hash = "$md5+$sha1";
    dbg('hash0: %s',$hash);
    return {t=>'H',h=>"=$hash"};
}
# IxHash style hash #1: MD5 over the whitespace skeleton of the body.
# Repeated whitespace is collapsed and every run of visible characters is
# removed before hashing.
# Returns {t=>'H', h=>'~<md5 hex>'}.
sub _make_hash1 {
    my ($self,$body) = @_;
    for ($body) {
        s/([[:space:]]{100})(?:\1+)/$1/g;
        s/([[:space:]])(?:\1+)/$1/g;
        s/[[:graph:]]+//g;
    }
    my $digest = Digest::MD5::md5_hex($body);
    dbg('hash1: %s',$digest);
    return {t=>'H',h=>"~$digest"};
}
# IxHash style hash #2: MD5 over what remains of the body after control
# characters, alphanumerics and the characters %&#;= are removed, '_' is
# mapped to '.', and repeated printable sequences are collapsed.
# Returns {t=>'H', h=>'~<md5 hex>'}.
sub _make_hash2 {
    my ($self,$body) = @_;
    for ($body) {
        s/[[:cntrl:][:alnum:]%&#;=]+//g;
        tr/_/./;
        s/([[:print:]]{100})(?:\1+)/$1/g;
        s/([[:print:]])(?:\1+)/$1/g;
    }
    my $digest = Digest::MD5::md5_hex($body);
    dbg('hash2: %s',$digest);
    return {t=>'H',h=>"~$digest"};
}
# IxHash style hash #3: MD5 over the body with control characters,
# whitespace and '=' removed and repeated sequences collapsed.
# Returns {t=>'H', h=>'~<md5 hex>'}.
sub _make_hash3 {
    my ($self,$body) = @_;
    for ($body) {
        s/[[:cntrl:][:space:]=]+//g;
        s/([[:print:]]{100})(?:\1+)/$1/g;
        s/([[:graph:]])(?:\1+)/$1/g;
    }
    my $digest = Digest::MD5::md5_hex($body);
    dbg('hash3: %s',$digest);
    return {t=>'H',h=>"~$digest"};
}
# Nilsimsa signature of the body.
# Returns {t=>'N', h=><hex digest>, v=><digest packed to bytes>,
#          b=><number of set bits in the digest>}.
sub _make_nilsimsa {
    my ($self,$body) = @_;
    my $digest = $self->{nilsimsa}->text2digest($body);
    my $packed = pack('H*',$digest);
    my $bitstr = unpack('b256',$packed);
    my $ones = ($bitstr =~ tr/1//);
    dbg('nilsimsa: %u %s',$ones,$digest);
    return {t=>'N',h=>$digest,v=>$packed,b=>$ones};
}
# Compute all hashes of the message once and cache them in
# $pms->{hashcounthashes}.
#   $pms - per-message status; body and pristine header come from {msg}
# Returns the number of hashes available (0 when the body is empty).
sub _make_hashes {
    my ($self,$pms) = @_;
    # Already computed for this message: reuse.
    return scalar @{$pms->{hashcounthashes}} if (defined($pms->{hashcounthashes}));
    my $hashes = $pms->{hashcounthashes} = [];

    my $ba = $pms->{msg}->get_body();
    return 0 unless ($ba && @{$ba});
    my $body = join('',@{$ba});
    # Normalise line endings to "\n".
    $body =~ s/\r\n/\n/gs;
    $body =~ s/\r/\n/gs;

    my $pristine = $pms->{msg}->get_pristine_header();
    if ($pristine) {
        # Re-assemble folded header lines into one entry per header field.
        my @header;
        foreach my $line (split(/[\r\n]+/,$pristine)) {
            last if ($line eq '');
            if (@header && $line =~ /^\s/) {
                $header[$#header] .= "$line\n";
            } else {
                push @header, "$line\n";
            }
        }
        push @{$hashes}, $self->_make_hash0($body,\@header);
    }

    # Fuzzy hashes. hash3 is the fallback for bodies that qualify for neither
    # hash1 nor hash2. The fallback must look at the fuzzy hashes only:
    # testing the whole list would let hash0 (present whenever there is a
    # header) suppress hash3 for practically every message.
    my $fuzzy = 0;
    if (($body =~ /(?:[\s\t].+?){20}/) && ($body =~ /\n.*?\n/)) {
        push @{$hashes}, $self->_make_hash1($body);
        $fuzzy++;
    }
    if ($body =~ /(?:(?:(?:[<>\(\)\|@\*'!?,]){3}|(:\/)))/m) {
        push @{$hashes}, $self->_make_hash2($body);
        $fuzzy++;
    }
    if (!$fuzzy && ($body =~ /\S{4}.*\S{4}/)) {
        push @{$hashes}, $self->_make_hash3($body);
    }

    push @{$hashes}, $self->_make_nilsimsa($body);
    return scalar @{$hashes};
}
# Plugin hook run at the start of a check: counts the message up front when
# hashcount_add contains "before" or "start". Always returns 1.
sub check_start {
    my ($self,$opt) = @_;
    my $pms = $opt->{permsgstatus};
    my $when = $self->{main}->{conf}->{hashcount_add};
    if ($when && $when =~ /(?:before|start)/i) {
        $self->_add_hashes($pms,1) if ($self->_make_hashes($pms));
    }
    return 1;
}
# Plugin hook run at the end of a check: counts the message when
# hashcount_add contains "after"/"end", and counts it as spam when
# hashcount_add contains "spam" and the message was classified as spam.
# Always returns 1.
sub check_end {
    my ($self,$opt) = @_;
    my $pms = $opt->{permsgstatus};
    my $when = $self->{main}->{conf}->{hashcount_add};
    return 1 unless ($when);

    my $add_mail = ($when =~ /(?:after|end)/i) ? 1 : 0;
    my $add_spam = ($when =~ /spam/i) ? $pms->is_spam() : 0;
    # Test the flags themselves: a hash holding both keys is always true, so
    # testing the hash would never skip the work below.
    return 1 unless ($add_mail || $add_spam);

    return 1 unless ($self->_make_hashes($pms));
    $self->_add_hashes($pms,$add_mail,$add_spam);
    return 1;
}
# Eval rule: true when the message has been seen more than $limit times in
# the ordinary stream (hashcount_stream, default 'mail').
#   $limit - threshold, default 100
#   $type  - 'fuzzy', 'crisp' or 'all' (optional)
#   $age   - only consider entries no older than this many minutes (optional)
# Returns 1 on a match, 0 otherwise.
sub hashcount_check {
    my ($self,$pms,$limit,$type,$age) = @_;
    # NOTE(review): string eval of a rule argument, kept for compatibility
    # with rules that pass an expression; rule definitions must be trusted.
    $limit = eval($limit) if (defined($limit));
    # Fall back to the default when no usable limit was given.
    $limit = 100 unless (defined($limit));
    dbg('check limit: %u',$limit);
    return 0 unless ($self->_make_hashes($pms));
    my $stream = $self->{main}->{conf}->{hashcount_stream};
    $stream = 'mail' unless (defined($stream));
    my $hit = $self->_count_hashes($pms,$limit,$age?$age*60:0,$stream,$type);
    # _count_hashes() already returns a 0/1 verdict; comparing that verdict
    # with $limit again would only ever succeed for a limit of 0.
    return $hit ? 1 : 0;
}
# Eval rule: true when the message has been counted as spam more than
# $limit times.
#   $limit - threshold, default 10
#   $type  - 'fuzzy', 'crisp' or 'all' (optional)
#   $age   - only consider entries no older than this many minutes (optional)
# Returns 1 on a match, 0 otherwise.
sub hashcount_check_spam {
    my ($self,$pms,$limit,$type,$age) = @_;
    # NOTE(review): string eval of a rule argument, kept for compatibility
    # with rules that pass an expression; rule definitions must be trusted.
    $limit = eval($limit) if (defined($limit));
    # Fall back to the default when no usable limit was given.
    $limit = 10 unless (defined($limit));
    dbg('check spam limit: %u',$limit);
    return 0 unless ($self->_make_hashes($pms));
    my $hit = $self->_count_hashes($pms,$limit,$age?$age*60:0,'spam',$type);
    # _count_hashes() already returns a 0/1 verdict; comparing that verdict
    # with $limit again would only ever succeed for a limit of 0.
    return $hit ? 1 : 0;
}
# Eval rule: true when the message has been recorded in the 'trap' stream
# more than $limit times.
#   $limit - threshold, default 0
#   $type  - 'fuzzy', 'crisp' or 'all' (optional)
#   $age   - only consider entries no older than this many minutes (optional)
# Returns 1 on a match, 0 otherwise.
sub hashcount_check_trap {
    my ($self,$pms,$limit,$type,$age) = @_;
    # NOTE(review): string eval of a rule argument, kept for compatibility
    # with rules that pass an expression; rule definitions must be trusted.
    $limit = eval($limit) if (defined($limit));
    # Fall back to the default when no usable limit was given.
    $limit = 0 unless (defined($limit));
    dbg('check trap limit: %u',$limit);
    return 0 unless ($self->_make_hashes($pms));
    my $hit = $self->_count_hashes($pms,$limit,$age?$age*60:0,'trap',$type);
    # _count_hashes() already returns a 0/1 verdict; comparing that verdict
    # with $limit again would only ever succeed for a limit of 0.
    return $hit ? 1 : 0;
}
# Eval rule: true when the message has been recorded in the 'flag' stream
# more than $limit times.
#   $limit - threshold, default 0
#   $type  - 'fuzzy', 'crisp' or 'all' (optional)
#   $age   - only consider entries no older than this many minutes (optional)
# Returns 1 on a match, 0 otherwise.
sub hashcount_check_flag {
    my ($self,$pms,$limit,$type,$age) = @_;
    # NOTE(review): string eval of a rule argument, kept for compatibility
    # with rules that pass an expression; rule definitions must be trusted.
    $limit = eval($limit) if (defined($limit));
    # Fall back to the default when no usable limit was given.
    $limit = 0 unless (defined($limit));
    dbg('check flag limit: %u',$limit);
    return 0 unless ($self->_make_hashes($pms));
    my $hit = $self->_count_hashes($pms,$limit,$age?$age*60:0,'flag',$type);
    # _count_hashes() already returns a 0/1 verdict; comparing that verdict
    # with $limit again would only ever succeed for a limit of 0.
    return $hit ? 1 : 0;
}
1;
(2008-06-16)