Whatever

SA: ExtractText.cf

# $Id: ExtractText.cf,v 1.8 2009/07/10 13:49:37 jonas Exp $

# General ExtractText plugin switches.
# extracttext_mime_magic is the only one enabled here; presumably it turns on
# content-based (MIME magic) type detection -- confirm against the plugin docs.
extracttext_mime_magic		yes
# The three logging options below are left commented out (inactive).
# Judging by their names they control diagnostic output; uncomment to debug.
#extracttext_log_to_stderr	yes
#extracttext_log_msgid		yes
#extracttext_log_text		yes

#add_header			all	ExtractText     _EXTRACTTEXTCHARS_ _EXTRACTTEXTWORDS_ <_EXTRACTTEXTFLAGS_> [_EXTRACTTEXTTOOLS_] {_EXTRACTTEXTEXTENSIONS_} (_EXTRACTTEXTTYPES_)

# Word documents.
# Two tools are registered: the external antiword binary and the OpenXML
# plugin module. Both are mapped to .doc/.dot and the ms-word MIME types, so
# either may be offered an attachment that carries a legacy Word name or type;
# only openxml is mapped to the .docx/.dotx/.dotm family.
#
# In the MIME type patterns "\." is a single-escaped literal dot. Do not double
# the backslash: "\\." would demand a literal backslash in the type string, and
# types such as "application/vnd.ms-word" would then never match.
extracttext_external	antiword	{CS:UTF-8} /usr/local/bin/antiword -t -w 0 -m UTF-8.txt -
extracttext_use		antiword	.doc .dot application/(?:vnd\.?)?ms-?word.*

extracttext_module	openxml		Mail::SpamAssassin::Plugin::ExtractText::OpenXML
extracttext_use		openxml		.docx .dotx .dotm application/(?:vnd\.?)openxml.*?word.*
extracttext_use		openxml		.doc .dot application/(?:vnd\.?)?ms-?word.*

# RTF documents via the external unrtf binary, run with the custom tag file
# ExtractText.tags and with pictures suppressed (--nopict).
# The {CF:...} pattern targets markers of the form <{[:=- ... -=:]}>
# (presumably emitted via ExtractText.tags and stripped from the output --
# confirm against the plugin docs). The square brackets are escaped with a
# single backslash; a doubled backslash would turn "[" into the start of a
# character class and break the pattern.
# .doc is listed too, so files named .doc are also offered to unrtf.
extracttext_external	unrtf		{CS:UTF-8} {CF:<{\[:=-.*?-=:\]}>} /usr/local/bin/unrtf -t ExtractText.tags --nopict
extracttext_use		unrtf		.doc .rtf application/rtf text/rtf

# OpenDocument text (.odt/.ott) and StarWriter (.sdw/.stw) via odt2txt,
# invoked on the attachment file (${file}) with UTF-8 output.
# The {CF:...} pattern targets bracketed markers of the form
# "[-- <word>: ... --]" (presumably odt2txt placeholders that should not count
# as document text -- confirm against the plugin docs). "\[", "\]", "\s" and
# "\S" use a single backslash; doubled backslashes would match literal
# backslashes instead of brackets/whitespace classes.
extracttext_external	odt2txt		{CS:UTF-8} {CF:\[--\s+\S+:\s.*?--\]} /usr/local/bin/odt2txt --encoding=UTF-8 ${file}
extracttext_use		odt2txt		.odt .ott application/.*?opendocument.*text
extracttext_use		odt2txt		.sdw .stw application/(?:x-)?soffice application/(?:x-)?starwriter

# PDF documents.
# Two external tools are defined, but only pdftotext is mapped to .pdf and
# application/pdf by the extracttext_use line below.
# NOTE(review): pdftohtml has no extracttext_use mapping in the visible
# configuration, so it looks unused; its {CS:} charset is also empty.
# Confirm whether it is kept deliberately (e.g. as a ready-made alternative).
extracttext_external	pdftohtml	{CS:} {CF:<.*?>} /usr/local/bin/pdftohtml -i -xml -stdout -noframes ${file}
extracttext_external	pdftotext	{CS:UTF-8} /usr/local/bin/pdftotext -q -nopgbrk -enc UTF-8 ${file} -
extracttext_use		pdftotext	.pdf application/pdf

# PDF_NO_TEXT: fires when a PDF extraction tool reported that it found no text.
# Flags are matched as "<tool>_NoText". Both PDF tool names defined in this
# file are accepted: pdftotext is the one mapped to .pdf attachments, and
# pdftohtml is kept so the rule still works if that tool is mapped instead.
header			PDF_NO_TEXT	X-ExtractText-Flags =~ /(?:pdftohtml|pdftotext)_NoText/
describe		PDF_NO_TEXT	PDF without text
score			PDF_NO_TEXT	0.25

# DOC_NO_TEXT: fires when the X-ExtractText-Flags header contains a
# "<tool>_NoText" flag for any of the word-processor tools configured in this
# file (antiword, openxml, unrtf, odt2txt). Scored at 0.5.
header			DOC_NO_TEXT	X-ExtractText-Flags =~ /(?:antiword|openxml|unrtf|odt2txt)_NoText/
describe		DOC_NO_TEXT	Document without text
score			DOC_NO_TEXT	0.5

(2009-07-10)