Whatever

SA: ExtractText.pm

=head1 NAME

ExtractText - extracts text from documenmts.

=head1 SYNOPSIS

	loadplugin Mail::SpamAssassin::Plugin::ExtractText /usr/local/etc/mail/spamassassin/plugins/ExtractText.pm

	extracttext_mime_magic    yes

	extracttext_external      antiword     {CS:UTF-8} /usr/local/bin/antiword -t -w 0 -m UTF-8.txt -
	extracttext_use           antiword     .doc application/(?:vnd\\.?)?ms-?word.*

	extracttext_module        openxml      Mail::SpamAssassin::Plugin::ExtractText::OpenXML
	extracttext_use           openxml      .docx .dotx .dotm application/(?:vnd\\.?)openxml.*?word.*
	extracttext_use           openxml      .doc .dot application/(?:vnd\\.?)?ms-?word.*

	extracttext_external      unrtf        {CS:UTF-8} {CF:<{\\[:=-.*?-=:\\]}>} /usr/local/bin/unrtf -t ExtractText.tags --nopict
	extracttext_use           unrtf        .doc .rtf application/rtf text/rtf
  
	extracttext_external      odt2txt      {CS:UTF-8} {CF:\\[--\\s+\\S+:\\s.*?--\\]} /usr/local/bin/odt2txt --encoding=UTF-8 ${file}
	extracttext_use           odt2txt      .odt .ott application/.*?opendocument.*text
	extracttext_use           odt2txt      .sdw .stw application/(?:x-)?soffice application/(?:x-)?starwriter

	extracttext_module        dummy        Mail::SpamAssassin::Plugin::ExtractText::Dummy
	extracttext_dummy_test1                one two three
	extracttext_dummy_test2                four five

	header                    DOC_NO_TEXT  X-ExtractText-Flags =~ /(?:antiword|openxml|unrtf|odt2txt)_Notext/
	describe                  DOC_NO_TEXT  Document without text
	score                     DOC_NO_TEXT  0.5

=head1 DESCRIPTION

This module uses external tools or plugins to extract text from message parts,
and then sets the text as the rendered part.

How to extract text from what is completely configuarable, and bases on a
part's MIME type and file name and optionally also the part's content.

=head1 REQUIREMENT

=over

=item *
SpamAssassin

=item *
IO::String;

=item *
Text::ParseWords

=item *
IPC::Run3

=item *
Encode

=item *
Encode::Detect

=back

=head2 Optional

=over

=item *
File::MimeInfo::Magic

=item *
freedesktop mime database

=back

=head1 CONFIGURATION

In the configuration options, \ is used as an escape character. To include an
actual \ (in regexes for example), use \\.

=head2 Options

=over

=item extracttext_log_to_stderr

Makes the plugin write debug and information to STDERR as well as using the
normal SpamAssassin calls.

=item extracttext_log_msgid

Makes the plugin include the Message-ID in debug and information output.

=item extracttext_log_text

Makes the plugin log all extracted text.

=item extracttext_mime_magic

Specifies wether to use File::MimeInfo::Magic to get canonical MIME types, to
try extracting text from parts with erroneous MIME type declarations, and to
set MIME types for new objects when a plugin didn't.

=item extracttext_mime_database

Sepcifies where the freedesktop MIME database is located.

=back

=head2 Tools

=over

=item extracttext_use

Specifies what tool to use for what message parts.

The general syntax is

	extracttext_use  <name>  <specifiers>

=over

=item name

the internal name of a tool.

=item specifiers

File extension and regular expressions for file names and MIME
types. The regular experssions are anchored to beginning and end.

=back

=head3 Examples

	extracttext_use  antiword  .doc application/(?:vnd\\.?)?ms-?word.*
	extracttext_use  openxml   .docx .dotx .dotm application/(?:vnd\\.?)openxml.*?word.*
	extracttext_use  openxml   .doc .dot application/(?:vnd\\.?)?ms-?word.*
	extracttext_use  unrtf     .doc .rtf application/rtf text/rtf

=item extracttext_external

Defines an external tool. The tool must read a document on standard input or
from a file and write text to standard output.

The general syntax is

	extracttext_external <name> [options] <command> [parameters]

=over

=item name

The internal name of this tool.

=item command

The full path to the external command to run.

=item parameters

Parameters for the external command. The string ${file} in a
parameter will be replaced with the file name of a temporary file
containing the document.

=item options

See below

=back

=head3 Examples

	extracttext_external  antiword  {CS:UTF-8} /usr/local/bin/antiword -t -w 0 -m UTF-8.txt -
	extracttext_external  unrtf     {CS:UTF-8} {CF:<{\\[:=-.*?-=:\\]}>} /usr/local/bin/unrtf -t ExtractText.tags --nopict
	extracttext_external  odt2txt   {CS:UTF-8} /usr/local/bin/odt2txt --encoding=UTF-8 ${file}

=item extracttext_module

Defines a plugin module. The module must implement the function Extract.

The general syntax is

	extracttext_module <name> [options] <package> [path]

=over

=item name

The internal name of this tool.

=item package

The full package name of the module.

=item path

The full path to the module. If this is not specified, it will be
searched for in @INC. If the package name defines the module as a sub
module of this module, it will also be searched for in the sub
directory of this module.

=item options

See below

=back

=head3 Example

	extracttext_module  openxml  Mail::SpamAssassin::Plugin::ExtractText::OpenXML

=item options

The general syntax for options to extracttext_external and extracttext_module
is

	{Option:value}

=over

=item CharSet

Character set used for decoding the text. Defaults do 'detect'
for extracttext_external. A setting of <xml> means to get the
character set from an XML header.

=item CommentFilter

A regular expression removing text.

=back

=back

=head2 Metadata

The plugin adds some pseudo headers to the message. These headers are seen by
the bayes system, and can be used in normal SpamAssassin rules.

The headers are also available as template tags as noted below.

=head3 Example

The fictional example headers below are based on a message containing this:

=over

=item 1
A perfectly normal PDF.

=item 2
An OpenXML document with a word document inside.
Neither Office document contains text.

=back

=head3 Headers

=over

=item X-ExtractText-Chars

Tag: _EXTRACTTEXTCHARS_

Contains a count of characters that were extracted.

	X-ExtractText-Chars: 10970

=item X-ExtractText-Words

Tag: _EXTRACTTEXTWORDS_

Contains a count of "words" that were extracted.

	X-ExtractText-Chars: 1599

=item X-ExtractText-Tools

Tag: _EXTRACTTEXTTOOLS_

Contains chains of tools used for extraction.

	X-ExtractText-Tools: pdftohtml openxml_antiword

=item X-ExtractText-Types

Tag: _EXTRACTTEXTTYPES_

Contains chains of MIME types for parts found during extraction.

	X-ExtractText-Types: application/pdf; application/vnd.openxmlformats-officedocument.wordprocessingml.document, application/ms-word

=item X-ExtractText-Extensions

Tag: _EXTRACTTEXTEXTENSIONS_

Contains chains of canonicalized file extensions (not from headers) for parts
found during extraction.

	X-ExtractText-Extensions: pdf docx_doc

=item X-ExtractText-Flags

Tag: _EXTRACTTEXTFLAGS_

Contains notes from the plugin.

	X-ExtractText-Flags: openxml_NoText

=back

=head3 Rules

Example:

	header    PDF_NO_TEXT  X-ExtractText-Flags =~ /pdftohtml_Notext/
	describe  PDF_NO_TEXT  PDF without text
	score     PDF_NO_TEXT  0.25

=head1 PLUGINS

A plugin is a simple module that implements a function called Extract.

I suggest that plugins have package names like this:

	Mail::SpamAssassin::Plugin::ExtractText::*

If the plugin need to be configured, this can be done with the function
Configure.

=head2 Extract

=head3 Call

The Extract function looks like this:

	sub Extract($extracttext,$object)

=over

=item $extracttext

The calling Mail::SpamAssassin::Plugin::ExtractText object.

=item $object

A decoded or extracted document object (see below).

=back

=head3 Return

Extract should return this (any return parameter may be undef):

	($error,$text,\@newobjects)

=over

=item $error

An error message if applicable.

=item $text

The extrated text, if any.

=item \@newobjects

An array reference of objects that should be processed by this plugin.

=back

=head2 Configure

Note: any extractor implementing Configure must be defined before any
configuration lines it is supposed to handle.

The optional Configure function is called when keys of the following format
if encountered in the spamassassin configuration:

	extracttext_<tool>_<key>

If any values are specified, they are split by white space. To avoid this use
quotes (") or escape with "\". To include an actual \, use \\.

=over

=item <tool>

The internal name of an extractor implemented as a module.

=item <key>

A string conforming to SpamAssassin configuration key format.

=back

=head3 Call

The Configure function looks like this:

	sub Configure($extracttext,$key,@values)

=over

=item $extracttext

The calling Mail::SpamAssassin::Plugin::ExtractText object.

=item $key

The (down cased) configuration key.

=item @values

The values following the key (if any).

=back

=head3 Return

Configure should return true if it handled the key and false otherwise.

=head2 Objects

A document object is a hash reference with the following contents.

=over

=item data

The raw data in a scalar reference.

=item file

The data file (this will be deleted).

=item type

MIME Type.

=item name

File name.

=back

Either data or file must be present.

A plugin must be able to handle both data and file.

=cut

package Mail::SpamAssassin::Plugin::ExtractText;

# $Id: ExtractText.pm,v 1.25 2009/07/10 13:58:14 jonas Exp $

use strict;
use base 'Mail::SpamAssassin::Plugin';
use Mail::SpamAssassin::Util ();
use IO::String;
use Text::ParseWords;
use IPC::Run3;
use Encode;
use Encode::Detect;

sub new {
	my ($class,$mailsa) = @_;
	$class = ref($class) || $class;
	my $self = $class->SUPER::new($mailsa);
	bless($self,$class);
	$self->{match} = [];
	$self->{tools} = {};
	$self->{magic} = 0;
	$self->{modul} = {};
	$self->{canon} = {};
	$self->{depth} = 32;
	$self->register_method_priority('post_message_parse',-1);
	return $self;
}

sub _logmsg {
	my $self = shift;
	my $lev = shift;
	my $msg = shift;
	for (my $i=0;$i<@_;$i++) { $_[$i] = '' unless (defined($_[$i])); }
	$msg = $self->{curmid} ? sprintf("extracttext: %s $msg",$self->{curmid},@_) : sprintf("extracttext: $msg",@_);
	print STDERR "[$lev] $msg\n" if ($self->{stderr});
	return $msg;
}

sub dbg {
	my $self = shift;
	Mail::SpamAssassin::Plugin::dbg($self->_logmsg('dbg',@_));
}

sub info {
	my $self = shift;
	Mail::SpamAssassin::Plugin::info($self->_logmsg('inf',@_));
}

sub isch {
	my $self = shift;
	warn($self->_logmsg('err',@_));
}


sub parse_config {
	my ($self,$pars) = @_;
	return 0 if ($pars->{user_config});
	return 0 unless ($pars->{key} =~ /^extracttext_(.+)$/i);
	my $key = lc($1);
	my @val = shellwords($pars->{value});
	if ($key eq 'use') {
		my $tool = lc(shift @val);
		return 0 unless ($tool && @val);
		while (@val) {
			my $what = shift @val;
			if ($what ne '') {
				my $where;
				if ($what =~ /.+\/.+/) {
					$where = 'type';
				} else {
					$where = 'name';
					$what = ".*\\$what"  if ($what =~ /^\.[a-zA-Z0-9]+$/);
				}
				push @{$self->{match}}, {where=>$where,what=>$what,tool=>$tool};
				$self->dbg('use: %s %s %s',$tool,$where,$what);
			}
		}
	} elsif ($key =~ /^external|module$/) {
		my $name = lc(shift @val);
		return 0 unless ($name && @val);
		if ($self->{tools}->{$name}) {
			$self->isch('Tool exists: %s',$name);
			return 0;
		}
		my $tool = {name=>$name,type=>$key};
		while (@val && $val[0] =~ /^\{(.*?)\}$/) {
			my $cmd = $1;
			my $val = 1;
			if ($cmd =~ /^(.*?):(.*)$/) {
				$cmd = $1;
				$val = $2;
			}
			if ($cmd =~ /^C(?:har)?S(?:et)?$/i) {
				$tool->{charset} = $val;
			} elsif ($cmd =~ /^C(?:omment)?F(?:ilter)?$/i) {
				$tool->{comrexp} = [] unless ($tool->{comrexp});
				push @{$tool->{comrexp}}, $val;
			} else {
				$self->isch('Bad tool config: %s %s',$tool->{name},$cmd);
				return 0;
			}
			shift @val;
		}
		return 0 unless (@val);
		$tool->{spec} = \@val;
		if ($tool->{type} eq 'external') {
			unless (-x $tool->{spec}->[0]) {
				$self->isch('Missing tool: %s %s',$tool->{name},$tool->{spec}->[0]);
				return 0;
			}
			$tool->{charset} = 'detect' unless ($tool->{charset});
		} elsif ($tool->{type} eq 'module') {
			my $package = $tool->{spec}->[0];
			my $path = $tool->{spec}->[1];
			unless ($package) {
				$self->isch('Bad module: %s',$tool->{name});
				return 0;
			}
			unless (defined($self->{modul}->{$package})) {
				unless ($path) {
					$self->{modul}->{$package} = eval("require $package;");
					unless ($self->{modul}->{$package} || substr($package,0,length(__PACKAGE__)+2) ne __PACKAGE__.'::') {
						$path = __FILE__;
						$path =~ s/\.pm$//;
						$path .= substr($package,length(__PACKAGE__),length($package)).'.pm';
						$path =~ s/::/\//g;
					}
				}
				if ($path) {
					$self->{modul}->{$package} = eval{ require $path; };
					$self->isch('Error loading module: %s %s %s',$package,$path,$@) unless ($self->{modul}->{$package});
				} else {
					$self->isch('Error loading module: %s %s',$package,$@) unless ($self->{modul}->{$package});
				}
			}
			return 0 unless ($self->{modul}->{$package});
		} else {
			return 0;
		}
		$self->{tools}->{$name} = $tool;
		$self->dbg('%s: %s "%s"',$key,$name,join('","',@{$tool->{spec}}));
	} elsif ($key =~ /^
			(?:use_?)?(?:mime_?)?(magic)		|
			(?:log_?(?:to_?)?)?(stderr)		|
			log_?(msgid)				|
			log_?(text)
			$/x) {
		$key = $+;
		if (!@val) {
			$self->{$key} = 1;
		} elsif ($val[0] =~ /^no?|f(?:alse)?|off?|[-+]?0+$/i) {
			$self->{$key} = 0;
		} elsif ($val[0] =~ /^y(?:es)?|t(?:rue)?|on|[-+]\d+$/i) {
			$self->{$key} = 1;
		}
		$self->dbg('set: %s=%s',$key,$self->{$key});
	} elsif (@val && $key =~ /^
			(mime)(?:_?dir|(?:_?data)?(?:_?base)?|(?:_?db)?)?
			$/x) {
		$key = $+;
		$self->{$key} = $val[0];
		$self->dbg('set: %s=%s',$key,$self->{$key});
	} elsif (@val && $val[0] =~ /^\d+$/ && $key =~ /^
			(?:max_?)?(depth)
			$/x) {
		$key = $+;
		$self->{$key} = $val[0];
		$self->dbg('set: %s=%u',$key,$self->{$key});
	} elsif ($key =~ /^([^_]+)_(.+)$/) {
		my $name = $1;
		$key = $2;
		return 0 unless ($self->{tools}->{$name});
		return 0 unless ($self->{tools}->{$name}->{type} eq 'module');
		return 0 unless (@{$self->{tools}->{$name}->{spec}});
		my $package = $self->{tools}->{$name}->{spec}->[0];
		my $eval = sprintf('$ok=(%s->can("Configure") && %s::Configure($self,$key,@val));',$package,$package);
		$self->dbg('Module eval: %s %s',$name,$eval);
		my $ok = 0;
		eval($eval);
		$self->isch("Module configure eval error: %s ? %s",$package,$@) if ($@);
		return 0 unless ($ok);
	} else {
		return 0;
	}
	$self->inhibit_further_callbacks();
	return 1;
}

sub _read_mime {
	my ($self) = @_;
	return 0 unless ($self->{mime});
	$self->dbg('MIME database: %s',$self->{mime});
	my %types = ();
	my $cc = 0;
	my $fh = undef;
	if (opendir($fh,$self->{mime})) {
		my @bases = ();
		while (my $d = readdir($fh)) {
			push @bases, $d unless ($fh =~ /^\.+$/);
		}
		closedir($fh);
		foreach my $base (@bases) {
			$fh = undef;
			next unless (opendir($fh,$self->{mime}.'/'.$base));
			while (my $f = readdir($fh)) {
				next unless ($f =~ /^(.+?)\.xml$/i);
				my $type = lc("$base/$1");
				next unless ($type && $type !~ /^\s+$/);
				my $tryp = $type;
				$tryp =~ s/[^\/a-z0-9]+//g;
				$types{$type} = 1;
				next if ($self->{canon}->{$tryp});
				$self->{canon}->{$tryp} = $type;
				$cc ++;
			}
			closedir($fh);
		}
	}
	$fh = undef;
	if (open($fh,'<',$self->{mime}.'/globs2')) {
		while (my $l = <$fh>) {
			next if ($l =~ /^\s*#/);
			$l =~ s/[\r\n]+//gs;
			next unless ($l =~ /^\d+:(.+?):/);
			my $type = lc($1);
			next unless ($type && $type !~ /^\s+$/);
			my $tryp = $type;
			$tryp =~ s/[^\/a-z0-9]+//g;
			$types{$type} = 1;
			next if ($self->{canon}->{$tryp});
			$self->{canon}->{$tryp} = $type;
			$cc ++;
		}
		close($fh);
	}
	$fh = undef;
	if (open($fh,'<',$self->{mime}.'/globs')) {
		while (my $l = <$fh>) {
			next if ($l =~ /^\s*#/);
			$l =~ s/[\r\n]+//gs;
			next unless ($l =~ /^(.+?):/);
			my $type = lc($1);
			next unless ($type && $type !~ /^\s+$/);
			my $tryp = $type;
			$tryp =~ s/[^\/a-z0-9]+//g;
			$types{$type} = 1;
			next if ($self->{canon}->{$tryp});
			$self->{canon}->{$tryp} = $type;
			$cc ++;
		}
		close($fh);
	}
	$fh = undef;
	if (open($fh,'<',$self->{mime}.'/subclasses')) {
		while (my $l = <$fh>) {
			next if ($l =~ /^\s*#/);
			$l =~ s/[\r\n]+//gs;
			foreach my $type (split(/\s+/,lc($l))) {
				next unless ($type && $type !~ /^\s+$/);
				my $tryp = $type;
				$tryp =~ s/[^\/a-z0-9]+//g;
				next if ($self->{canon}->{$tryp});
				$self->{canon}->{$tryp} = $type;
				$cc ++;
			}
		}
		close($fh);
	}
	$fh = undef;
	if (open($fh,'<',$self->{mime}.'/aliases')) {
		while (my $l = <$fh>) {
			next if ($l =~ /^\s*#/);
			$l =~ s/[\r\n]+//gs;
			my @alst = split(/\s+/,lc($l));
			pop @alst while (@alst && $alst[$#alst] =~ /^\s*$/);
			shift @alst while (@alst && $alst[0] =~ /^\s*$/);
			next unless (@alst);
			my $typt = '';
			foreach my $type (@alst) {
				next unless ($type && $type !~ /^\s+$/);
				next unless ($types{$type});
				$typt = $type;
				last;
			}
			$typt = $alst[$#alst] ;
			foreach my $type (@alst) {
				next unless ($type && $type !~ /^\s+$/);
				my $tryp = $type;
				$tryp =~ s/[^\/a-z0-9]+//g;
				next if ($self->{canon}->{$tryp});
				$self->{canon}->{$tryp} = $typt;
				$cc ++;
			}
		}
		close($fh);
	}
	return $cc;
}

sub finish_parsing_end {
	my ($self,$pars) = @_;
	if ($self->{magic}) {
		eval{ use File::MimeInfo::Magic (); };
		$self->{magic} = $@ ? 0 : 1;
		$self->{magic} = File::MimeInfo::Magic->new() if ($self->{magic});
	}
	unless ($self->{mime}) {
		foreach my $bd (('/usr/local/share','/usr/share')) {
			next unless ((-d $bd) && ((-f "$bd/mime/globs") || (-f "$bd/mime/globs2") || (-f "$bd/mime/aliases")));
			$self->{mime} = "$bd/mime";
			last;
		}
	}
	$self->_read_mime;
	#foreach my $tool (values %{$self->{tools}}) {
	#	next if bad
	#	$tool->{ok} = 1;
	#}
	#foreach my $tool (keys %{$self->{tools}}) {
	#	delete $self->{tools}->{$tool} unless ($self->{tools}->{$tool}->{ok});
	#}
}

sub _get_canon {
	my ($self,$type) = @_;
	return $type unless ($type);
	my $tryp = $type;
	$tryp =~ s/[^\/a-z0-9]+//g;
	return $self->{canon}->{$tryp} if ($self->{canon}->{$tryp});
	return $self->{magic}->mimetype_canon($type) if ($self->{magic});
	return $type;
}

sub _get_magic {
	my ($self,$data) = @_;
	return undef unless ($data && $$data && $self->{magic});
	my $dath = IO::String->new($data);
	my $magt = $self->{magic}->magic($dath);
	$dath->close();
	return $magt;
}

sub _tmpfile {
	my ($object,$tmp,$err) = @_;
	unless ($$tmp) {
		if ($object->{file}) {
			$$tmp = $object->{file};
		} else {
			my ($path,$file) = Mail::SpamAssassin::Util::secure_tmpfile();
			if ($path && $file) {
				$$tmp = $path;
				print $file ${$object->{data}};
				$$err = 2 unless (close($file));
			} else {
				$$err = 1;
			}
			$$tmp = '#' if ($$err);
		}
	}
	return $$tmp;
}
sub _extract_external {
	my ($self,$object,$tool) = @_;
	my $ok = 0;
	my ($extracted,$error);
	my @cmd = @{$tool->{spec}};
	my $tmp;
	my $err = 0;
	for (my $i=1;$i<@cmd;$i++) {
		$cmd[$i] =~ s/\$\{f(?:ile)?\}/_tmpfile($object,\$tmp,\$err)/gei;
		if ($err) {
			$self->isch('Temp file error!');
			return 0;
		}
	}
	my $sin;
	if ($tmp) {
		my $es = '';
		$sin = \$es;
	} else {
		if ($object->{file} && !defined($object->{data})) {
			my $fh;
			return 0 unless (open($fh,'<',$object->{file}));
			my $fd = join('',<$fh>);
			close($fh);
			$object->{data} = \$fd;
		}
		$sin = $object->{data};
	}
	$self->dbg('External call: %s "%s"',$tool->{name},join('","',@cmd));
	eval { $ok = run3(\@cmd,$sin,\$extracted,\$error); };
	my $ret = $?;
	if ($ret || !$ok || $error) {
		$error = '?' unless ($error);
		$error =~ s/^[\s\r\n]+//s;
		$error =~ s/[\s\r\n]+$//s;
		$error =~ s/[\r\n]+/; /gs;
		$error =~ s/\s+/ /g;
		$self->info('External extraction command: "%s"',join('","',@cmd));
		$self->info('External extraction object: %s %s "%s"',$object->{data}?length($object->{data}):'-',$object->{type},$object->{name});
		$self->info('External extraction error: %s %u %s',$tool->{name},$ret,$error);
	}
	unlink($tmp) if ($tmp && !$object->{file});
	return 0 if ($ret || !$ok || ($error && !$extracted));
	return (1,$extracted);
}

sub _extract_module {
	my ($self,$object,$tool) = @_;
	my $package = $tool->{spec}->[0];
	return 0 unless ($self->{modul}->{$package});
	my ($extracted,$error,$newobjects);
	my $eval = sprintf('($error,$extracted,$newobjects)=%s::Extract($self,$object);',$package);
	$self->dbg('Module eval: %s %s',$tool->{name},$eval);
	eval($eval);
	my $ret = $@;
	$self->info("Module extraction eval error: %s ? %s",$package,$@) if ($ret);
	$self->info('Module extraction object: %s %s "%s"',$object->{data}?length($object->{data}):'-',$object->{type},$object->{name}) if ($ret || $error);
	$self->info("Module extraction error: %s %s ? %s",$tool->{name},$package,$error) if ($error);
	return 0 if ($ret || $error);
	return (1,$extracted,$newobjects);
}

sub _extract_object {
	my ($self,$object,$tool) = @_;
	my ($ok,$extracted,$objects);
	if ($tool->{type} eq 'external') {
		($ok,$extracted,$objects) = $self->_extract_external($object,$tool);
	} elsif ($tool->{type} eq 'module') {
		($ok,$extracted,$objects) = $self->_extract_module($object,$tool);
	} else {
		$self->isch('Bad tool type:',$tool->{type});
		return 0;
	}
	return 0 unless ($ok);
	if ($tool->{charset}) {
		my $chrs;
		if (lc($tool->{charset}) eq '<xml>') {
			$chrs = ($extracted =~ /<\?xml(?: [^>]*?)*? encoding="([^"]+)"/i) ? $1 : 'detect';
		} else {
			$chrs = $tool->{charset};
		}
		eval{ $extracted=decode($chrs,$extracted); };
		$self->dbg('Decode failed: %s',$@) if ($@);
	}
	if ($tool->{comrexp}) {
		foreach my $comrexp (@{$tool->{comrexp}}) {
			$extracted =~ s#$comrexp##gsi;
		}
	}
	$extracted = '' if ($extracted =~ /^[\s\r\n]*$/s);
	if (defined($extracted) && $extracted ne '') {
		$self->info('Extracted %u chars using %s',length($extracted),$tool->{name});
		foreach my $l (split(/[\r\n]+/,$extracted)) {
			next unless ($l =~ /\S/);
			$self->{text}
				? $self->info('Text: %s',encode('ISO-8859-1',$l))
				: $self->dbg('Text: %s',encode('ISO-8859-1',$l));
		}
	} else {
		$self->info('No text extracted');
	}
	return (1,$extracted,$objects);
}

sub _get_type {
	my ($self,$object,$norec) = @_;
	my ($type,$mtype);
	if ($object->{type}) {
		$mtype = $self->_get_canon($object->{type});
		$mtype = $object->{type} unless ($mtype);
		$type = $mtype unless ($mtype eq 'application/octet-stream');
	};
	if (!$type && $self->{magic}) {
		$type = $self->{magic}->globs($object->{name}) if ($object->{name});
		$type = $self->{magic}->globs($object->{file}) if (!$type && $object->{file});
	}
	$type = $mtype if (!$type && $mtype);
	return $type ? ($type) : ();
}
sub _get_extension {
	my ($self,$object) = @_;
	my $fext;
	if ($self->{magic} && $object->{type} && $object->{type} ne 'application/octet-stream') {
		$fext = $self->{magic}->extensions($object->{type});
		$fext =~ s/^\.// if ($fext);
	}
	if (!$fext && $object->{name} && $object->{name} =~ /\.([^.\\\/]+)$/) {
		$fext = $1;
	}
	if (!$fext && $object->{file} && $object->{file} =~ /\.([^.\\\/]+)$/) {
		$fext = $1;
	}
	return $fext ? ($fext) : ();
}

sub _extract {
	my ($self,$coll,$part,$type,$name,$data,$tool) = @_;
	my $object = {
		data	=> $data,
		type	=> $type,
		name	=> $name,
		depth	=> 1,
	};
	my @types = $self->_get_type($object);
	my @fexts = $self->_get_extension($object);
	my @tools = ($tool->{name});
#	$part->{ExtractText_Decoded_Size} = length($$data);
	my ($ok,$extracted,$objects) = $self->_extract_object($object,$tool);
	return 0 unless ($ok);
	my $text = (defined($extracted)) ? $extracted : '';
	while (defined($objects) && @{$objects}) {
		$object = shift @{$objects};
		next if ($object->{depth} >= $self->{depth});
		if ($object->{file} && !$object->{name}) {
			$object->{name} = $object->{file};
			$object->{name} =~ s/^.*[\\\/]//;
		}
		if ($self->{magic}) {
			$object->{type} = $self->{magic}->globs($object->{name}) if (!$object->{type} && $object->{name});
			$object->{type} = $self->_get_magic($object->{data}) if (!$object->{type} && $object->{data});
			$object->{type} = $self->{magic}->magic($object->{file}) if (!$object->{type} && $object->{file});
			if (!$object->{name} && $object->{type}) {
				my $ext = $self->{magic}->extensions($object->{type});
				$object->{name} = 'ExtractTextExtracted'.$ext if ($ext);
			}
		}
		push @types, $self->_get_type($object);
		push @fexts, $self->_get_extension($object);
		if ($object->{type} || $object->{name}) {
			$self->dbg('Object: %s %s',$object->{type},$object->{name});
			my %checked = ();
			foreach my $match (@{$self->{match}}) {
				next unless ($self->{tools}->{$match->{tool}});
				next if ($checked{$match->{tool}});
				if ($match->{where} eq 'name') {
					next unless (defined($object->{name}) && $object->{name} =~ m#^$match->{what}$#i);
					$self->dbg('Match: name "%s" =~ "%s"',$object->{name},$match->{what});
				} elsif ($match->{where} eq 'type') {
					next unless (defined($object->{type}) && $object->{type} =~ m#^$match->{what}$#i);
					$self->dbg('Match: type "%s" =~ "%s"',$object->{type},$match->{what});
				} else {
					next;
				}
				my $moreobjects;
				$checked{$match->{tool}} = 1;
				($ok,$extracted,$moreobjects) = $self->_extract_object($object,$self->{tools}->{$match->{tool}});
				next unless ($ok);
				push @tools, $self->{tools}->{$match->{tool}}->{name};
				$text .= $extracted if (defined($extracted));
				if ($moreobjects) {
					foreach my $mobj (@{$moreobjects}) {
						$mobj->{depth} = $object->{depth} + 1;
					}
					push @{$objects}, @{$moreobjects};
				}
			}
		}
		unlink($object->{file}) if ($object->{file});
	}
	if ($text eq '') {
		push @{$coll->{flags}}, join('_',$tool->{name},'NoText');
	} else {
		$coll->{chars} += length($text);
		$coll->{words} += scalar @{[split(/\W+/s,$text)]} - 1;
		$part->set_rendered($text) ;
	}
#	$part->{ExtractText_Text_Size} = length($text);
	if (@types) {
		push @{$coll->{types}}, join(', ',@types) ;
#		$part->{ExtractText_Types} = [@types];
#		$part->{ExtractText_Type} = $types[0];
	}
	if (@fexts) {
		push @{$coll->{extensions}}, join('_',@fexts) ;
#		$part->{ExtractText_Extensions} = [@fexts];
#		$part->{ExtractText_Extension} = $fexts[0];
	}
	push @{$coll->{tools}}, join('_',@tools) ;
#	$part->{ExtractText_Tools} = [@tools];
#	$part->{ExtractText_Extension} = $tools[0];
	return 1;
}

sub _check_extract {
	my ($self,$coll,$checked,$part,$decoded,$data,$type,$name) = @_;
	return 0 unless (defined($type) || defined($name));
	foreach my $match (@{$self->{match}}) {
		next unless ($self->{tools}->{$match->{tool}});
		next if ($checked->{$match->{tool}});
		if ($match->{where} eq 'name') {
			next unless (defined($name) && $name =~ m#^$match->{what}$#i);
			$self->dbg('Match: name "%s" =~ "%s"',$name,$match->{what});
		} elsif ($match->{where} eq 'type') {
			next unless (defined($type) && $type =~ m#^$match->{what}$#i);
			$self->dbg('Match: type "%s" =~ "%s"',$type,$match->{what});
		} else {
			next;
		}
		unless ($$decoded) {
			$$data = $part->decode();
			$$decoded = 1;
		}
		last unless ($$data);
		$checked->{$match->{tool}} = 1;
		return 1 if ($self->_extract($coll,$part,$type,$name,$data,$self->{tools}->{$match->{tool}}));
	}
	return 0;
}

sub _put_metadata {
	my ($self,$msg,$name,$value) = @_;
	$msg->put_metadata($name,$value);
	$self->dbg('%s: %s',$name,$value);
}

sub post_message_parse {
	my ($self,$pars) = @_;
	my $msg = $pars->{'message'};
	return 0 unless ($msg);
	if ($self->{msgid}) {
		my $cmid = $msg->get_pristine_header('Message-ID');
		$self->{curmid} = $cmid ? $cmid : '-';
		$self->{curmid} =~ s/[\s\r\n]+//s;
	}
	my %collect = (
		tools		=> [],
		types		=> [],
		extensions	=> [],
		flags		=> [],
		chars		=> 0,
		words		=> 0,
	);
	foreach my $part ($msg->find_parts('.*',1,1)) {
		next unless ($part->is_leaf);
		my ($rmt,$rtd) = $part->rendered;
		next if (defined($rtd));
		my %checked = ();
		my $dat = undef;
		my $dec = 0;
		my $typ = $part->{type};
		my $nam = $part->{name};
		$self->dbg('Part: %s %s',$typ,$nam);
		next if ($self->_check_extract(\%collect,\%checked,$part,\$dec,\$dat,$typ,$nam));
		my $mag = $self->_get_canon($typ);
		if ($mag && $mag ne $typ) {
			$self->dbg('Canon: %s',$mag);
			next if ($self->_check_extract(\%collect,\%checked,$part,\$dec,\$dat,$mag));
		}
		$mag = $part->{magic_mime_type};
		if (!$mag && $self->{magic}) {
			unless ($dec) {
				$dat = $part->decode();
				$dec = 1;
			}
			if ($dat) {
				$mag = $self->_get_magic(\$dat);
				$part->{magic_mime_type} = $mag if ($mag);
			}
		}
		if ($mag && $mag ne $typ) {
			$self->dbg('Magic: %s',$mag);
			next if ($self->_check_extract(\%collect,\%checked,$part,\$dec,\$dat,$mag));
		}
		next if ($self->_check_extract($msg,$part,\$dec,\$dat,'',''));
		$self->dbg('Not extracted');
	}
	$self->_put_metadata($msg,'X-ExtractText-Words',$collect{words});
	$self->_put_metadata($msg,'X-ExtractText-Chars',$collect{chars});
	$self->_put_metadata($msg,'X-ExtractText-Tools',join(' ',@{$collect{tools}})) if (@{$collect{tools}});
	$self->_put_metadata($msg,'X-ExtractText-Types',join('; ',@{$collect{types}})) if (@{$collect{types}});
	$self->_put_metadata($msg,'X-ExtractText-Extensions',join(' ',@{$collect{extensions}})) if (@{$collect{extensions}});
	$self->_put_metadata($msg,'X-ExtractText-Flags',join(' ',@{$collect{flags}})) if (@{$collect{flags}});
	return 1;
}

sub parsed_metadata {
	my ($self,$pars) = @_;
	my $pms = $pars->{permsgstatus};
	return 0 unless ($pms);
	my $msg = $pms->get_message;
	return 0 unless ($msg);
	foreach my $tag (('Words','Chars','Tools','Types','Extensions','Flags')) {
		my $v = $msg->get_metadata("X-ExtractText-$tag");
		$pms->set_tag("ExtractText$tag",defined($v)?$v:'');
		#$self->dbg('Tag: %s=%s',"ExtractText$tag",$v);
	}
	return 1;
}

1;

(2009-07-10)