Whatever

sa: HashCount.pm

=head1 NAME

HashCount - counts messages using nilsimsa and IxHash/NixSpam like hashes.

=head1 SYNOPSIS

	loadplugin Mail::SpamAssassin::Plugin::HashCount /usr/local/etc/mail/spamassassin/plugins/HashCount.pm

	hashcount_sql_dsn             DBI:mysql:dbname:localhost
	hashcount_sql_username        user
	hashcount_sql_password        pass
	hashcount_add	                after,spam
	hashcount_stream              mail

	describe  HASHCOUNT_20        More than 20 copies of message
	header    HASHCOUNT_20        eval:hashcount_check(20)
	score     HASHCOUNT_20        0.01
  
	header    HASHCOUNT_SPAM_5    eval:hashcount_check_spam(5)
	score     HASHCOUNT_SPAM_5    1.0

	header     HASHCOUNT_TRAP     eval:hashcount_check_trap()
	score      HASHCOUNT_TRAP     1.0

	header     HASHCOUNT_FLAG     eval:hashcount_check_flag(0)
	score      HASHCOUNT_FLAG     2.0

=head1 DESCRIPTION

This module counts messages using nilsimsa signatures and hashing similar to
that used by IxHash and NixSpam, and provides eval rules that can check this
count.

=head1 REQUIREMENT

This plugin is tested with MySQL only. It uses a table like this:

	CREATE TABLE hashcount (
	  stamp INTEGER UNSIGNED NOT NULL DEFAULT 0,
	  type CHAR(4),
	  what CHAR(1),
	  hash VARCHAR(74),
	  help SMALLINT
	);
	CREATE INDEX idx_hash ON hashcount (type,what,hash(33));
	CREATE INDEX idx_help ON hashcount (type,what,help);

=head1 CONFIGURATION

=head2 Eval tests

=over

=item hashcount_check(limit,type,age)

True if a message identified by the hashing has been seen more than limit
times.

Type can be set to:

=over

=item *
fuzzy: only fuzzy matches are checked.

=item *
crisp: only non fuzzy matches are checked.

=item *
all: all matches are checked. This is the default.

=back

If age is specified, only checks for matches no older than age minutes.

NOTE: If hashcount_add is set to add messages before the testing, all
messages will normally be seen at least once before this check is
done.

Limit defaults to 100.

=item hashcount_check_spam(limit,type,age)

As hashcount_check(), but checks if a message identified by the hashing has
been considered as spam.

=item hashcount_check_trap(limit,type,age)

As hashcount_check(), but checks if a message identified by the hashing has
been seen by spam trap.

=item hashcount_check_flag(limit,type,age)

As hashcount_check(), but checks if a message identified by the hashing has
been manually flagged as spam.

=back

=head2 Options

=over

=item hashcount_sql_dsn

Which database driver and database to use.

=item hashcount_sql_username

User name for the database connection.

=item hashcount_sql_password

Password for the database connection.

=item hashcount_add

If and when to add messages to the database.

=over

=item *
Include "before" or "start" to count messages before testing.

=item *
Include "after" or "end", to count messages after testing.

=item *
Include "spam" to count messages considered spam.

=back

=back

=head1 NOTES

For the eval tests hashcount_check_trap and hashcount_check_flag to be
effective, some other software is needed.

We use the mail reporter and MIMEDefang filter at

	http://whatever.frukt.org/mimedefangfilter.text.shtml

=head1 SEE ALSO

=over

=item IxHash

	http://wiki.apache.org/spamassassin/iXhash

=item NixSpam

	http://www.heise.de/ix/nixspam/

=back

=cut

package Mail::SpamAssassin::Plugin::HashCount;

# $Id: HashCount.pm,v 1.13 2009/06/26 11:52:16 jonas Exp $

use strict;
use base 'Mail::SpamAssassin::Plugin';
use DBI;
use Digest::MD5;
use Digest::SHA;
use Digest::Nilsimsa;

# Write a debug line through SpamAssassin's plugin debug function.
# The first argument is a sprintf format string and the remaining
# arguments are its values; every line is prefixed with "hashcount: ".
sub dbg {
	my ($format,@values) = @_;
	my $line = sprintf("hashcount: $format",@values);
	return Mail::SpamAssassin::Plugin::dbg($line);
}

# Constructor: builds the plugin object, creates the Nilsimsa digest
# helper, seeds default database settings and registers the four eval
# rules provided by this plugin.
#   $class   class name or an existing object
#   $mailsa  main SpamAssassin object, handed on to the parent constructor
# Returns the new plugin object.
sub new {
	my ($class,$mailsa) = @_;
	$class = ref($class) || $class;
	my $self = $class->SUPER::new($mailsa);
	bless($self,$class);
	$self->{sqldb} = undef;
	# Direct method call instead of the indirect-object form
	# "new Digest::Nilsimsa", which perl can parse ambiguously.
	$self->{nilsimsa} = Digest::Nilsimsa->new;
	# Defaults only; parse_config() overwrites them with hashcount_sql_*
	# values from the configuration.
	# NOTE(review): shipping a default user name/password in code is
	# questionable - consider requiring explicit configuration.
	$self->{main}->{conf}->{hashcount_sql_dsn} = 'DBI:mysql:mdf:localhost';
	$self->{main}->{conf}->{hashcount_sql_username} = 'sa';
	$self->{main}->{conf}->{hashcount_sql_password} = 'pwd';
	$self->register_eval_rule('hashcount_check');
	$self->register_eval_rule('hashcount_check_spam');
	$self->register_eval_rule('hashcount_check_trap');
	$self->register_eval_rule('hashcount_check_flag');
	dbg('registered');
	return $self;
}

# Configuration hook: accepts the hashcount_* options and stores them in
# the main configuration hash.
#   $pars  hash with the keys user_config, key and value
# Returns 0 when the line is user configuration or not one of our
# options, 1 when the option was stored (further callbacks are then
# inhibited).
sub parse_config {
	my ($self,$pars) = @_;
	return 0 if ($pars->{user_config});
	my ($option) = $pars->{key} =~ /^hashcount_(sql_dsn|sql_username|sql_password|expire|add|stream)$/;
	return 0 unless (defined($option));
	# Credentials are never written to the debug log.
	my $shown = ($option =~ /(username|password)/) ? '' : $pars->{value};
	if ($shown) {
		$shown = " = $shown";
	}
	dbg('config %s%s',$option,$shown);
	my $conf = $self->{main}->{conf};
	$conf->{$pars->{key}} = $pars->{value};
	$self->inhibit_further_callbacks();
	return 1;
}

# Make sure a database handle is available in $self->{sqldb}.
# An existing handle is reused; otherwise DBI->connect_cached() is
# called with the hashcount_sql_* settings.
# Returns 1 when a handle is available, 0 when connecting failed.
sub _sql_connect {
	my ($self) = @_;
	unless ($self->{sqldb}) {
		my $conf = $self->{main}->{conf};
		$self->{sqldb} = DBI->connect_cached(
			$conf->{hashcount_sql_dsn},
			$conf->{hashcount_sql_username},
			$conf->{hashcount_sql_password},
			{RaiseError=>0}
		);
		unless ($self->{sqldb}) {
			dbg('sql connect failed');
			return 0;
		}
	}
	return 1;
}

# Close the database handle, if there is one, and forget it.
sub _sql_disconnect {
	my ($self) = @_;
	my $dbh = $self->{sqldb};
	$dbh->disconnect() if ($dbh);
	$self->{sqldb} = undef;
}

# Count stored rows matching the message's plain hashes (entries of type
# 'H' in $pms->{hashcounthashes}).
#   $pms      per-message status holding the hashcounthashes list
#   $limit    threshold the row count must exceed
#   $stamp    absolute cut-off time (epoch seconds) as computed by
#             _count_hashes(); only rows with stamp > $stamp are counted,
#             0 means no cut-off
#   $counter  value of the "type" column to look in (e.g. 'mail', 'spam')
#   $prefix   optional regex fragment the hash must start with
# Expects $self->{sqldb} to be an open database handle.
# Returns 1 as soon as one hash has been seen more than $limit times,
# 0 otherwise (including on SQL prepare failure).
sub _count_hashes_hash {
	my ($self,$pms,$limit,$stamp,$counter,$prefix) = @_;
	# _count_hashes() has already applied hashcount_expire and turned the
	# age into an absolute timestamp.  Converting it a second time here
	# turned the cut-off back into a small number and disabled the age
	# filter, so the value is used as is.
	$stamp = 0 unless ($stamp);
	my $st;
	foreach my $hash (@{$pms->{hashcounthashes}}) {
		next unless ($hash && $hash->{t} eq 'H');
		next if ($prefix && $hash->{h} !~ /^$prefix/);
		unless ($st) {
			# Prepared lazily so nothing is sent when no hash qualifies.
			my $cmd = 'SELECT count(stamp) FROM hashcount WHERE type=? AND what=? AND hash=? AND stamp>?';
			dbg('Q %s',$cmd);
			$st = $self->{sqldb}->prepare_cached($cmd);
			unless ($st) {
				dbg('sql prepare failed');
				return 0;
			}
		}
		dbg('E %s H %s %u',$counter,$hash->{h},$stamp);
		unless ($st->execute($counter,'H',$hash->{h},$stamp)) {
			# Best effort: a failed query counts as "not seen".
			dbg('sql execute failed');
			next;
		}
		my @res = $st->fetchrow_array;
		$st->finish;
		my $cnt = (@res && $res[0]) ? $res[0] : 0;
		dbg('count: %u',$cnt);
		return 1 if ($cnt > $limit);
	}
	return 0;
}

# Number of bits that differ between two nilsimsa signatures.
#   $val1  first signature as a packed binary string
#   $sig2  second signature as a hex string
# Returns the count of set bits in the XOR of the two values.
sub _calc_nilsimsa {
	my ($self,$val1,$sig2) = @_;
	my $diff = $val1 ^ pack('H*',$sig2);
	my $bits = unpack('b256',$diff);
	return ($bits =~ tr/1//);
}
# Count stored rows whose nilsimsa signature is close to the message's
# signature (entries of type 'N' in $pms->{hashcounthashes}).
#   $pms      per-message status holding the hashcounthashes list
#   $limit    threshold the summed row count must exceed
#   $stamp    absolute cut-off time (epoch seconds) as computed by
#             _count_hashes(); only rows with stamp > $stamp are counted,
#             0 means no cut-off
#   $counter  value of the "type" column to look in (e.g. 'mail', 'spam')
#   $prefix   optional match-type pattern; nilsimsa is a fuzzy method, so
#             nothing is checked unless the pattern contains '~'
# Candidates are pre-selected in SQL by the "help" column lying strictly
# within +/-10 of the signature's set-bit count, then confirmed by
# _calc_nilsimsa() reporting fewer than 10 differing bits.
# Returns 1 as soon as the matching rows exceed $limit, 0 otherwise
# (including on connect or prepare failure).
sub _count_hashes_nilsimsa {
	my ($self,$pms,$limit,$stamp,$counter,$prefix) = @_;
	return 0 if ($prefix && $prefix !~ /~/i);
	return 0 unless ($self->_sql_connect());
	# _count_hashes() has already applied hashcount_expire and turned the
	# age into an absolute timestamp.  Converting it a second time here
	# turned the cut-off back into a small number and disabled the age
	# filter, so the value is used as is.
	$stamp = 0 unless ($stamp);
	my $st;
	foreach my $hash (@{$pms->{hashcounthashes}}) {
		next unless ($hash && $hash->{t} eq 'N');
		unless ($st) {
			# Prepared lazily so nothing is sent when no signature exists.
			my $cmd = 'SELECT count(stamp),hash FROM hashcount WHERE type=? AND what=? AND (help IS NOT NULL AND help>? AND help<?) AND stamp>? GROUP BY hash';
			dbg('Q %s',$cmd);
			$st = $self->{sqldb}->prepare_cached($cmd);
			unless ($st) {
				dbg('sql prepare failed');
				return 0;
			}
		}
		dbg('E %s N %u %u',$counter,$hash->{b},$stamp);
		unless ($st->execute($counter,'N',$hash->{b}-10,$hash->{b}+10,$stamp)) {
			# Best effort: a failed query counts as "not seen".
			dbg('sql execute failed');
			next;
		}
		my $cnt = 0;
		while (my $res = $st->fetchrow_arrayref) {
			next unless (@{$res} && $res->[1] && $res->[0]>0);
			next unless ($self->_calc_nilsimsa($hash->{v},$res->[1]) < 10);
			dbg('C: %u %s',$res->[0],$res->[1]);
			$cnt += $res->[0];
			next unless ($cnt > $limit);
			dbg('count: %u',$cnt);
			$st->finish;
			return 1;
		}
		$st->finish;
		dbg('count: %u',$cnt);
	}
	return 0;
}

# Check whether the message has been counted more than $limit times.
#   $pms      per-message status holding the hashcounthashes list
#   $limit    threshold handed on to the counting helpers
#   $stamp    maximum age in seconds (0/undef: no age limit)
#   $counter  which counter to look in (e.g. 'mail', 'spam', 'trap')
#   $prefix   match type: "fuzzy", "crisp" or "all"; any other non-empty
#             value is handed on unchanged
# Returns 1 when either the plain-hash or the nilsimsa lookup reports a
# hit, 0 otherwise or when the database is unreachable.
sub _count_hashes {
	my ($self,$pms,$limit,$stamp,$counter,$prefix) = @_;
	# Translate the match type into the hash prefix pattern.
	if (!$prefix) {
		$prefix = '';
	} elsif ($prefix =~ /fuzzy/i) {
		$prefix = '~';
	} elsif ($prefix =~ /crisp/i) {
		$prefix = '=';
	} elsif ($prefix =~ /all/i) {
		$prefix = '[~=]';
	}
	# NOTE(review): string eval of the age value - confirm it is always numeric.
	$stamp = eval($stamp) if ($stamp);
	# hashcount_expire is in days and caps the age.
	my $days = $self->{main}->{conf}->{hashcount_expire};
	if ($days) {
		my $max_age = $days*24*60*60;
		$stamp = $max_age unless ($stamp && $stamp<$max_age);
	}
	# Turn the age into an absolute cut-off time.
	if ($stamp) {
		$stamp = time()-$stamp;
	} else {
		$stamp = 0;
	}
	return 0 unless ($self->_sql_connect());
	my @args = ($pms,$limit,$stamp,$counter,$prefix);
	# The nilsimsa lookup only runs when the plain hashes gave no hit.
	my $hit = ($self->_count_hashes_hash(@args) || $self->_count_hashes_nilsimsa(@args)) ? 1 : 0;
	$self->_sql_disconnect();
	return $hit;
}

# Store the message's hashes in the database.
#   $pms    per-message status holding the hashcounthashes list
#   $acnt   true: add to the configured stream counter (hashcount_stream,
#           default 'mail')
#   $acnts  true: add to the 'spam' counter
# All rows are inserted in one transaction where possible; on the first
# failed insert the transaction is rolled back.
# Returns -1 when there is nothing to do, 0 on connect/prepare/insert
# failure, otherwise the result of the commit (or 1).
sub _add_hashes {
	my ($self,$pms,$acnt,$acnts) = @_;
	my @counters = ();
	if ($acnt) {
		my $stream = $self->{main}->{conf}->{hashcount_stream};
		push @counters, defined($stream) ? $stream : 'mail';
	}
	push @counters, 'spam' if ($acnts);
	return -1 unless (@counters);
	return -1 unless (@{$pms->{hashcounthashes}});
	return 0 unless ($self->_sql_connect());
	# One statement for plain hashes, one for nilsimsa rows that also
	# carry the bit count in the "help" column.
	my $ins_plain = $self->{sqldb}->prepare_cached('INSERT INTO hashcount (stamp,type,what,hash) VALUES (?,?,?,?)');
	unless ($ins_plain) {
		dbg('sql prepare failed');
		$self->_sql_disconnect();
		return 0;
	}
	my $ins_help = $self->{sqldb}->prepare_cached('INSERT INTO hashcount (stamp,type,what,hash,help) VALUES (?,?,?,?,?)');
	unless ($ins_help) {
		dbg('sql prepare failed');
		$ins_plain->finish;
		$self->_sql_disconnect();
		return 0;
	}
	my $ok = 1;
	my $now = time();
	$self->{sqldb}->begin_work if ($self->{sqldb}->{AutoCommit});
	COUNTER: foreach my $counter (@counters) {
		foreach my $hash (@{$pms->{hashcounthashes}}) {
			next unless ($hash);
			next if ($hash->{h} =~ /^#/);
			my $stored = 0;
			if ($hash->{t} eq 'H') {
				dbg('add: %s H %s',$counter,$hash->{h});
				$stored = $ins_plain->execute($now,$counter,'H',$hash->{h});
			} elsif ($hash->{t} eq 'N') {
				dbg('add: %s N %u %s',$counter,$hash->{b},$hash->{h});
				$stored = $ins_help->execute($now,$counter,'N',$hash->{h},$hash->{b});
			}
			next if ($stored);
			# A failed insert (or an unknown entry type) aborts everything.
			dbg('sql insert failed');
			$ok = 0;
			last COUNTER;
		}
	}
	unless ($self->{sqldb}->{AutoCommit}) {
		if ($ok) {
			$ok = $self->{sqldb}->commit;
		} else {
			$self->{sqldb}->rollback;
		}
	}
	$ins_help->finish;
	$ins_plain->finish;
	$self->_sql_disconnect();
	return $ok;
}

# Exact ("crisp") hash of the message including some headers.
#   $body    message body text
#   $header  array ref of header lines
# Leading and trailing whitespace is trimmed from the body, the
# Content-* and Subject headers are put in front of it, and the MD5 and
# SHA-1 hex digests of the result are joined with '+'.
# Returns a hash entry of type 'H' whose value starts with '='.
sub _make_hash0 {
	my ($self,$body,$header) = @_;
	$body =~ s/^[\s\n]+//s;
	$body =~ s/[\s\n]+$//s;
	my $kept = join('',grep { /^(?:Content-\S+|Subject):/i } @{$header});
	my $text = "$kept\n$body";
	my $md5 = Digest::MD5::md5_hex($text);
	my $sha1 = Digest::SHA::sha1_hex($text);
	my $hash = join('+',$md5,$sha1);
	dbg('hash0: %s',$hash);
	return {t=>'H',h=>"=$hash"};
}
# IxHash style hash, variant 1: runs of repeated whitespace are
# collapsed, every printable non-space character is removed and the MD5
# hex digest of what is left is taken.
#   $body  message body text
# Returns a hash entry of type 'H' whose value starts with '~'.
sub _make_hash1 {
	my ($self,$body) = @_;
	for ($body) {
		s/([[:space:]]{100})(?:\1+)/$1/g;
		s/([[:space:]])(?:\1+)/$1/g;
		s/[[:graph:]]+//go;
	}
	my $digest = Digest::MD5::md5_hex($body);
	dbg('hash1: %s',$digest);
	return {t=>'H',h=>'~'.$digest};
}
# IxHash style hash, variant 2: control characters, alphanumerics and
# the characters %&#;= are removed, '_' is mapped to '.', repeated
# printable characters are collapsed and the MD5 hex digest is taken.
#   $body  message body text
# Returns a hash entry of type 'H' whose value starts with '~'.
sub _make_hash2 {
	my ($self,$body) = @_;
	for ($body) {
		s/[[:cntrl:][:alnum:]%&#;=]+//g;
		tr/_/./;
		s/([[:print:]]{100})(?:\1+)/$1/g;
		s/([[:print:]])(?:\1+)/$1/g;
	}
	my $digest = Digest::MD5::md5_hex($body);
	dbg('hash2: %s',$digest);
	return {t=>'H',h=>'~'.$digest};
}
# IxHash style hash, variant 3: control characters, whitespace and '='
# are removed, repeated characters are collapsed and the MD5 hex digest
# is taken.
#   $body  message body text
# Returns a hash entry of type 'H' whose value starts with '~'.
sub _make_hash3 {
	my ($self,$body) = @_;
	for ($body) {
		s/[[:cntrl:][:space:]=]+//g;
		s/([[:print:]]{100})(?:\1+)/$1/g;
		s/([[:graph:]])(?:\1+)/$1/g;
	}
	my $digest = Digest::MD5::md5_hex($body);
	dbg('hash3: %s',$digest);
	return {t=>'H',h=>'~'.$digest};
}
# Nilsimsa signature of the message body.
#   $body  message body text
# Returns a hash entry of type 'N' with
#   h  the signature as a hex string
#   v  the same signature as a packed binary string
#   b  the number of set bits in the signature
sub _make_nilsimsa {
	my ($self,$body) = @_;
	my %entry = (t=>'N');
	$entry{h} = $self->{nilsimsa}->text2digest($body);
	$entry{v} = pack('H*',$entry{h});
	$entry{b} = (unpack('b256',$entry{v}) =~ tr/1//);
	dbg('nilsimsa: %u %s',$entry{b},$entry{h});
	return \%entry;
}
# Compute all hashes of the message once and cache them in
# $pms->{hashcounthashes}.
#   $pms  per-message status; $pms->{msg} supplies body and headers
# Returns the number of cached entries (0 when the body is empty).
sub _make_hashes {
	my ($self,$pms) = @_;
	if (defined($pms->{hashcounthashes})) {
		return scalar @{$pms->{hashcounthashes}};
	}
	my $hashes = $pms->{hashcounthashes} = [];
	my $lines = $pms->{msg}->get_body();
	return 0 unless ($lines && @{$lines});
	# Normalise all line endings to "\n".
	my $body = join('',@{$lines});
	$body =~ s/\r\n/\n/gs;
	$body =~ s/\r/\n/gs;
	my $header = $pms->{msg}->get_pristine_header();
	if ($header) {
		# Rebuild the header list, gluing continuation lines (those
		# starting with whitespace) to the header they belong to.
		my @header;
		foreach my $line (split(/[\r\n]+/,$header)) {
			last if ($line eq '');
			if (@header && $line =~ /^\s/) {
				$header[-1] .= "$line\n";
			} else {
				push @header, "$line\n";
			}
		}
		push @{$hashes}, $self->_make_hash0($body,\@header);
	}
	# Each IxHash style variant is only used when the body qualifies.
	if (($body =~ /(?:[\s\t].+?){20}/) && ($body =~ /\n.*?\n/)) {
		push @{$hashes}, $self->_make_hash1($body);
	}
	if ($body =~ /(?:(?:(?:[<>\(\)\|@\*'!?,]){3}|(:\/)))/m) {
		push @{$hashes}, $self->_make_hash2($body);
	}
	# Variant 3 is a fallback for when nothing else produced a hash.
	if (!@{$hashes} && ($body =~ /\S{4}.*\S{4}/)) {
		push @{$hashes}, $self->_make_hash3($body);
	}
	push @{$hashes}, $self->_make_nilsimsa($body);
	#push @{$hashes}, '#';
	return scalar @{$hashes};
}

# Plugin hook run before the tests: when hashcount_add contains "before"
# or "start", the message's hashes are added to the stream counter.
#   $opt  hook arguments; $opt->{permsgstatus} is the per-message status
# Always returns 1.
sub check_start {
	my ($self,$opt) = @_;
	my $pms = $opt->{permsgstatus};
	my $when = $self->{main}->{conf}->{hashcount_add};
	if ($when && $when =~ /(?:before|start)/i && $self->_make_hashes($pms)) {
		$self->_add_hashes($pms,1);
	}
	return 1;
}

# Plugin hook run after the tests: adds the message's hashes to the
# counters selected by hashcount_add.
#   "after" or "end"  add to the stream counter
#   "spam"            add to the 'spam' counter when the message is spam
#   $opt  hook arguments; $opt->{permsgstatus} is the per-message status
# Always returns 1.
sub check_end {
	my ($self,$opt) = @_;
	my $pms = $opt->{permsgstatus};
	my $add = $self->{main}->{conf}->{hashcount_add};
	my $count_stream = ($add && $add =~ /(?:after|end)/i) ? 1 : 0;
	# Called in scalar context so an empty-list return cannot upset the
	# argument list further down.
	my $count_spam = 0;
	$count_spam = $pms->is_spam() if ($add && $add =~ /spam/i);
	# The old guard tested the two-key hash itself, which is always
	# true; what matters is whether any counter is actually selected.
	return 1 unless ($count_stream || $count_spam);
	return 1 unless ($self->_make_hashes($pms));
	$self->_add_hashes($pms,$count_stream,$count_spam);
	return 1;
}

# Eval rule: true when the message has been counted more than $limit
# times in the configured stream (hashcount_stream, default 'mail').
#   $limit  threshold; defaults to 100
#   $type   optional match type ("fuzzy", "crisp" or "all")
#   $age    optional maximum age of counted entries, in minutes
# Returns 1 on a hit, 0 otherwise.
sub hashcount_check {
	my ($self,$pms,$limit,$type,$age) = @_;
	# NOTE(review): string eval of a rule argument - confirm rule
	# arguments can only come from trusted configuration.
	$limit = defined($limit) ? eval($limit) : 100;
	dbg('check limit: %u',$limit);
	return 0 unless ($self->_make_hashes($pms));
	my $stream = $self->{main}->{conf}->{hashcount_stream};
	$stream = 'mail' unless (defined($stream));
	my $hit = $self->_count_hashes($pms,$limit,$age?$age*60:0,$stream,$type);
	# _count_hashes() has already compared the count with $limit and
	# returns a 0/1 flag.  Comparing that flag with $limit again made
	# the rule impossible to trigger for any limit of 1 or more.
	return $hit ? 1 : 0;
}

# Eval rule: true when the message has been counted more than $limit
# times in the 'spam' counter.
#   $limit  threshold; defaults to 10
#   $type   optional match type ("fuzzy", "crisp" or "all")
#   $age    optional maximum age of counted entries, in minutes
# Returns 1 on a hit, 0 otherwise.
sub hashcount_check_spam {
	my ($self,$pms,$limit,$type,$age) = @_;
	# NOTE(review): string eval of a rule argument - confirm rule
	# arguments can only come from trusted configuration.
	$limit = defined($limit) ? eval($limit) : 10;
	dbg('check spam limit: %u',$limit);
	return 0 unless ($self->_make_hashes($pms));
	my $hit = $self->_count_hashes($pms,$limit,$age?$age*60:0,'spam',$type);
	# _count_hashes() has already compared the count with $limit and
	# returns a 0/1 flag.  Comparing that flag with $limit again made
	# the rule impossible to trigger for any limit of 1 or more.
	return $hit ? 1 : 0;
}

# Eval rule: true when the message has been counted more than $limit
# times in the 'trap' counter.
#   $limit  threshold; defaults to 0
#   $type   optional match type ("fuzzy", "crisp" or "all")
#   $age    optional maximum age of counted entries, in minutes
# Returns 1 on a hit, 0 otherwise.
sub hashcount_check_trap {
	my ($self,$pms,$limit,$type,$age) = @_;
	# NOTE(review): string eval of a rule argument - confirm rule
	# arguments can only come from trusted configuration.
	$limit = defined($limit) ? eval($limit) : 0;
	dbg('check trap limit: %u',$limit);
	return 0 unless ($self->_make_hashes($pms));
	my $hit = $self->_count_hashes($pms,$limit,$age?$age*60:0,'trap',$type);
	# _count_hashes() has already compared the count with $limit and
	# returns a 0/1 flag.  Comparing that flag with $limit again only
	# worked for the default limit of 0.
	return $hit ? 1 : 0;
}

# Eval rule: true when the message has been counted more than $limit
# times in the 'flag' counter.
#   $limit  threshold; defaults to 0
#   $type   optional match type ("fuzzy", "crisp" or "all")
#   $age    optional maximum age of counted entries, in minutes
# Returns 1 on a hit, 0 otherwise.
sub hashcount_check_flag {
	my ($self,$pms,$limit,$type,$age) = @_;
	# NOTE(review): string eval of a rule argument - confirm rule
	# arguments can only come from trusted configuration.
	$limit = defined($limit) ? eval($limit) : 0;
	dbg('check flag limit: %u',$limit);
	return 0 unless ($self->_make_hashes($pms));
	my $hit = $self->_count_hashes($pms,$limit,$age?$age*60:0,'flag',$type);
	# _count_hashes() has already compared the count with $limit and
	# returns a 0/1 flag.  Comparing that flag with $limit again only
	# worked for the default limit of 0.
	return $hit ? 1 : 0;
}

1;

(2008-06-16)