Whatever

sa: MimeMagic.pm

=head1 NAME

MimeMagic - provides eval test checking MIME types from header and file name/content.

=head1 SYNOPSIS

	loadplugin Mail::SpamAssassin::Plugin::MimeMagic /usr/local/etc/mail/spamassassin/plugins/MimeMagic.pm
  
	mimemagic_map                  application/ole-document application/ole-storage
  
	describe  TYPE_MISMATCH        Content type mismatch
	header    TYPE_MISMATCH        eval:mimemagic_mismatch()
	score     TYPE_MISMATCH        0.5
  
	describe  EXE_MISMATCH         Content type mismatch for executable file
	header    EXE_MISMATCH         eval:mimemagic_mismatch_datatype('executable')
	score     EXE_MISMATCH         2.0

	describe  ZIPRAR_MISMATCH      ZIP file name for RAR file
	header    ZIPRAR_MISMATCH      eval:mimemagic_mismatch('-','/zip','/rar')
	score     ZIPRAR_MISMATCH      1.0
  
	describe  OFFICE_MISMATCH      Office document type mismatch
	header    OFFICE_MISMATCH      eval:mimemagic_mismatch_extension('doc','rtf','xls')
	score     OFFICE_MISMATCH      0.5
  
	describe  OLE_ATTACH           OLE part attached
	header    OLE_ATTACH           eval:mimemagic_find_datatype('application/ole-storage')
	score     OLE_ATTACH           0.01
  
=head1 DESCRIPTION

This module provides eval tests for MIME types and mismatches between part
MIME content type, file name, and file content.

=head1 REQUIREMENT

=over

=item *
SpamAssassin

=item *
IO:Scalar

=item *
File::MimeInfo::Magic

=item *
freedesktop mime database

=back

=head1 CONFIGURATION

=head2 Eval tests

The find tests simply checks if there is a part matching the specified
criteria.

The mismatch tests all checks for differences between the MIME type
specified for a part, the parts file extension (if any), and what the
parts contents looks like.

=over

=item mimemagic_find

Optional parameters (in the order as listed):

=over

=item *
extension as in mimemagic_find_extension

=item *
mime type as in mimemagic_find_extensiontype

=item *
mime type as in mimemagic_find_datatype

=item *
mime type as in mimemagic_find_contenttype

=back

Check for parts matching the provided parameters or
any part at all if no parameters are specified.

=item mimemagic_find_extension

Required parameter: extension

Check for parts with a file name matching the specified extension.

Specifying the extension '' will match empty file names and file
names without extension but not parts without a file name.

=item mimemagic_find_extensiontype

Required parameter: MIME type

Check for parts with a file name with a extension mapped to the
specified MIME type.

=item mimemagic_find_datatype

Required parameter: MIME type

Check for parts with content that seems to be of the specified
MIME type.

=item mimemagic_find_contenttype

Required parameter: MIME type

Check for parts with a a content type matching the specified
MIME type.

=item mimemagic_mismatch

Optional parameters (in the order as listed):

=over

=item *
extension as in mimemagic_mismatch_extension

=item *
mime type as in mimemagic_mismatch_extensiontype

=item *
mime type as in mimemagic_mismatch_datatype

=item *
mime type as in mimemagic_mismatch_contenttype

=back

Checks all parts matching the provided parameters or
all parts if no parameters are specified.

=item mimemagic_mismatch_extension

Required parameter: extension

Only check parts with a file name matching the specified extension.

Specifying the extension '' will match empty file names and file
names without extension but not parts without a file name.

=item mimemagic_mismatch_extensiontype

Required parameter: MIME type

Only check parts with a file name with a extension mapped to the
specified MIME type.

=item mimemagic_mismatch_datatype

Required parameter: MIME type

Only check parts with content that seems to be of the specified
MIME type.

=item mimemagic_mismatch_contenttype

Required parameter: MIME type

Only check parts with a a content type matching the specified
MIME type.

=back

=head2 Options

=over

=item mimemagic_map

Map one MIME type to another for the mismatch checks.

=item mimemagic_datasize

Maximum size of data to check (default 16KB).

=item mimemagic_stderr

Print debug output to STDERR.

=back

=cut

package Mail::SpamAssassin::Plugin::MimeMagic;

# $Id: MimeMagic.pm,v 1.7 2009/06/30 14:01:22 jonas Exp $

use strict;
use base 'Mail::SpamAssassin::Plugin';
use IO::Scalar;
use File::MimeInfo::Magic ();

sub new {
	my ($class,$mailsa) = @_;
	$class = ref($class) || $class;
	my $self = $class->SUPER::new($mailsa);
	bless($self,$class);
	$self->register_eval_rule('mimemagic_mismatch');
	$self->register_eval_rule('mimemagic_mismatch_extension');
	$self->register_eval_rule('mimemagic_mismatch_contenttype');
	$self->register_eval_rule('mimemagic_mismatch_extensiontype');
	$self->register_eval_rule('mimemagic_mismatch_datatype');
	$self->register_eval_rule('mimemagic_find');
	$self->register_eval_rule('mimemagic_find_extension');
	$self->register_eval_rule('mimemagic_find_contenttype');
	$self->register_eval_rule('mimemagic_find_extensiontype');
	$self->register_eval_rule('mimemagic_find_datatype');
	$self->{typemaps} = {
		'application/pgpsignature'	=> 'text/plain',
		'application/mbox'		=> 'text/plain',
		'message/rfc822'		=> 'text/plain',
		'application/msword'		=> 'application/olestorage',
		'application/vndmsexcel'	=> 'application/olestorage',
	};
	$self->{cfg_datasize} = 16*1024;
	#$self->dbg('registered');
	return $self;
}

sub dbg {
	my $self = shift;
	my $msg = shift;
	Mail::SpamAssassin::Plugin::dbg(sprintf("mimemagic: $msg",@_));
	print STDERR sprintf("mimemagic: $msg\n",@_) if (defined($self) && $self->{cfg_stderr});
}

sub parse_config {
	my ($self,$pars) = @_;
	return 0 unless ($pars->{key} =~ /^mimemagic_(.+)$/i);
	my $key = lc($1);
	my @val = split(/\s+/,$pars->{value});
	if ($key eq 'map') {
		return 0 unless ($#val == 1);
		$self->{typemaps}->{$self->_simplify($val[0])} = $self->_simplify($val[1]);
		#$self->dbg('cfg map: "%s" -> "%s"',$val[0],$val[1]);
	} elsif ($pars->{user_config}) {
		return 0;
	} elsif ($key =~ /^(?:stderr|datasize)$/) {
		$self->{"cfg_$key"} = ($key eq 'datasize') ? eval($pars->{value}) : $pars->{value};
		#$self->dbg('cfg %s: "%s"',$key,$self->{"cfg_$key"});
	} else {
		return 0;
	}
	$self->inhibit_further_callbacks();
	return 1;
}

sub _simplify {
	my ($self,$mt,$def) = @_;
	$mt = lc($mt ? $mt : $def ? $def : '');
	$mt =~ s/(^|\/)[Xx]-/$1/g;
	$mt =~ s/[-_.]//g;
	return $mt;
}

sub _massage_item {
	my ($self,$mt,$ext) = @_;
	return $mt unless (defined($mt));
	if ($ext) {
		$mt =~ s/^\.//;
	} else {
		$mt = $self->_simplify($mt);
	}
	$mt =~ s/([^-_.*?A-Za-z0-9])/\\$1/g;
	return $mt;
}
sub _massage_list {
	my ($self,$mtl,$ext) = @_;
	return undef unless (defined($mtl));
	my @ml = ();
	foreach my $mt (@{$mtl}) {
		push @ml, $self->_massage_item($mt,$ext) if (defined($mt));
	}
	return undef unless (@ml);
	return $ml[0] unless ($#ml);
	return '(?:'.join('|',@ml).')';
}
sub _massage_string {
	my ($self,$mtl,$ext) = @_;
	return undef unless (defined($mtl));
	return $self->_massage_list([split(/\s*;\s*/,$mtl)],$ext);
}
sub _massage {
	my ($self,$mtl,$ext) = @_;
	return $self->_massage_string($mtl,$ext) unless (ref($mtl));
	return $self->_massage_list($mtl,$ext) if (ref($mtl) eq 'ARRAY');
	return $self->_massage_string($$mtl,$ext) if (ref($mtl) eq 'SCALAR');
	return undef;
}

sub _compare_fuzz {
	my ($self,$mt1,$mt2) = @_;
	return 1 if ($mt1 eq '*BIN*' && $mt2 !~ /^(text|message)\//);
	return 1 if ($mt1 eq '*TXT*' && $mt2 =~ /^(text|message)\//);
	return 1 if ($mt1 eq 'application/octetstream' && $mt2 !~ /^(text|message)\//);
	return 1 if (defined($self->{typemaps}->{$mt1}) && $self->{typemaps}->{$mt1} eq $mt2);
	return 0;
}
sub _compare_types {
	my ($self,$mt1,$mt2) = @_;
	#$self->dbg('"%s" ?= "%s"',$mt1,$mt2);
	return 1 if ($mt1 eq $mt2);
	#$self->dbg('"%s" != "%s"',$mt1,$mt2);
	return -1 if ($mt1 eq '' || $mt2 eq '');
	return -1 if ($self->_compare_fuzz($mt1,$mt2));
	return -1 if ($self->_compare_fuzz($mt2,$mt1));
	#$self->dbg('"%s" !~ "%s"',$mt1,$mt2);
	return 0;
}
sub _compare {
	my ($self,$mt1,$mt2) = @_;
	my $res = $self->_compare_types($mt1,$mt2);
	$self->dbg('"%s" %s "%s"',$mt1,$res>0?'=':$res<0?'~':'!',$mt2);
	return $res;
}

sub _load {
	my ($self) = @_;
	$self->{mmagic} = File::MimeInfo::Magic->new() unless ($self->{mmagic});
	return $self->{mmagic} ? 1 : 0;
}

sub _type_data {
	my ($self,$part) = @_;
	return '' unless ($part->is_leaf);
	if ($part->{magic_mime_type}) {
		#$self->dbg('td stored magic type: "%s"',$part->{magic_mime_type});
		return $self->_simplify($part->{magic_mime_type});
	}
	return '' unless ($self->_load);
	my $data = ($self->{cfg_datasize} > 0) ? $part->decode($self->{cfg_datasize}) : $part->decode();
	return '' unless (defined($data));
	my $dh = new IO::Scalar(\$data);
	my $mt = $self->{mmagic}->magic($dh);
	$dh->close();
	unless (defined($mt)) {
		no warnings;
		if (utf8::valid($data)) {
			use bytes;
			return '*TXT*' unless ($data =~ /[^\m\t\r\n\s\x20-\xF6\xF9-\xFF]/s);
		} else {
			return '*TXT*' unless ($data =~ /[^\m\t\r\n\s\x20-\x7E\x80-\xFE]/s);
		}
		return '*BIN*'
	}
	#$self->dbg('td magic type: "%s"',$mt);
	$part->{magic_mime_type} = $mt;
	return $self->_simplify($mt);
}

sub _type_name {
	my ($self,$part) = @_;
	return '' unless ($part->is_leaf);
	my $fn = $part->{name};
	return '' unless (defined($fn) && $fn ne '');
	return '' unless ($self->_load);
	my $mt = $self->{mmagic}->globs($part->{name});
	return $self->_simplify($mt);
}

sub _check_parts {
	my ($self,$pms,$mis,$ext,$extmt,$datmt,$cntmt) = @_;
	$ext = $self->_massage($ext,1);
	$extmt = $self->_massage($extmt);
	$datmt = $self->_massage($datmt);
	$cntmt = $self->_massage($cntmt);
	#$self->dbg('ext  @ "%s"',$ext) if (defined($ext));
	#$self->dbg('extmt  @ "%s"',$extmt) if (defined($extmt));
	#$self->dbg('datmt  @ "%s"',$datmt) if (defined($datmt));
	#$self->dbg('cntmt  @ "%s"',$cntmt) if (defined($cntmt));
	my $pc = 0;
	$pms->{mimemagic_parts} = {} unless (defined($pms->{mimemagic_parts}));
	foreach my $part ($pms->{msg}->find_parts('.*',1,1)) {
		$pc ++;
		#$self->dbg('name  : "%s"',$part->{name}) if (defined($part->{name}));
		if (defined($ext)) {
			next unless (defined($part->{name}));
			next unless ($part->{name} =~ /\.$ext$/i || ($ext eq '' && $part->{name} !~ /\./));
		}
		$pms->{mimemagic_parts}->{$pc} = {} unless (defined($pms->{mimemagic_parts}->{$pc}));
		$pms->{mimemagic_parts}->{$pc}->{mtc} = $self->_simplify($part->{type}) unless (defined($pms->{mimemagic_parts}->{$pc}->{mtc}));
		#$self->dbg('cntmt : "%s"',$pms->{mimemagic_parts}->{$pc}->{mtc});
		next if (defined($cntmt) && $pms->{mimemagic_parts}->{$pc}->{mtc} !~ /$cntmt/i);
		$pms->{mimemagic_parts}->{$pc}->{mtn} = $self->_type_name($part) unless (defined($pms->{mimemagic_parts}->{$pc}->{mtn}));
		#$self->dbg('extmt : "%s"',$pms->{mimemagic_parts}->{$pc}->{mtn});
		next if (defined($extmt) && $pms->{mimemagic_parts}->{$pc}->{mtn} !~ /$extmt/i);
		$pms->{mimemagic_parts}->{$pc}->{mtd} = $self->_type_data($part) unless (defined($pms->{mimemagic_parts}->{$pc}->{mtd}));
		#$self->dbg('datmt : "%s"',$pms->{mimemagic_parts}->{$pc}->{mtd});
		next if (defined($datmt) && $pms->{mimemagic_parts}->{$pc}->{mtd} !~ /$datmt/i);
		return 1 unless ($mis);
		$pms->{mimemagic_parts}->{$pc}->{res} = (
			$self->_compare($pms->{mimemagic_parts}->{$pc}->{mtc},$pms->{mimemagic_parts}->{$pc}->{mtd}) &&
			$self->_compare($pms->{mimemagic_parts}->{$pc}->{mtd},$pms->{mimemagic_parts}->{$pc}->{mtn}) &&
			$self->_compare($pms->{mimemagic_parts}->{$pc}->{mtn},$pms->{mimemagic_parts}->{$pc}->{mtc})
		) unless (defined($pms->{mimemagic_parts}->{$pc}->{res}));
		next if ($pms->{mimemagic_parts}->{$pc}->{res});
		return 1;
	}
	return 0;
}	

sub mimemagic_mismatch {
	my ($self,$pms,$ext,$extmt,$datmt,$cntmt) = @_;
	$ext = undef if (defined($ext) && $ext =~ /^[-*?]$/);
	$extmt = undef if (defined($extmt) && $extmt =~ /^[-*?]$/);
	$datmt = undef if (defined($datmt) && $datmt =~ /^[-*?]$/);
	$cntmt = undef if (defined($cntmt) && $cntmt =~ /^[-*?]$/);
	return _check_parts($self,$pms,1,$ext,$extmt,$datmt,$cntmt);
}

sub mimemagic_mismatch_extension {
	my ($self,$pms,@ext) = @_;
	return 0 unless (@ext);
	return _check_parts($self,$pms,1,\@ext);
}

sub mimemagic_mismatch_extensiontype {
	my ($self,$pms,@extmt) = @_;
	return 0 unless (@extmt);
	return _check_parts($self,$pms,1,undef,\@extmt);
}

sub mimemagic_mismatch_datatype {
	my ($self,$pms,@datmt) = @_;
	return 0 unless (@datmt);
	return _check_parts($self,$pms,1,undef,undef,\@datmt);
}

sub mimemagic_mismatch_contenttype {
	my ($self,$pms,@cntmt) = @_;
	return 0 unless (@cntmt);
	return _check_parts($self,$pms,1,undef,undef,undef,\@cntmt);
}

sub mimemagic_find {
	my ($self,$pms,$ext,$extmt,$datmt,$cntmt) = @_;
	$ext = undef if (defined($ext) && $ext =~ /^[-*?]$/);
	$extmt = undef if (defined($extmt) && $extmt =~ /^[-*?]$/);
	$datmt = undef if (defined($datmt) && $datmt =~ /^[-*?]$/);
	$cntmt = undef if (defined($cntmt) && $cntmt =~ /^[-*?]$/);
	return _check_parts($self,$pms,0,$ext,$extmt,$datmt,$cntmt);
}

sub mimemagic_find_extension {
	my ($self,$pms,@ext) = @_;
	return 0 unless (@ext);
	return _check_parts($self,$pms,0,\@ext);
}

sub mimemagic_find_extensiontype {
	my ($self,$pms,@extmt) = @_;
	return 0 unless (@extmt);
	return _check_parts($self,$pms,0,undef,\@extmt);
}

sub mimemagic_find_datatype {
	my ($self,$pms,@datmt) = @_;
	return 0 unless (@datmt);
	return _check_parts($self,$pms,0,undef,undef,\@datmt);
}

sub mimemagic_find_contenttype {
	my ($self,$pms,@cntmt) = @_;
	return 0 unless (@cntmt);
	return _check_parts($self,$pms,0,undef,undef,undef,\@cntmt);
}

1;

(2008-02-01)