Whatever

sa: URLRedirect.pm

=head1 NAME

URLRedirect - updates metadata with redirects for specified sites.

=head1 SYNOPSIS

	loadplugin Mail::SpamAssassin::Plugin::URLRedirect /usr/local/etc/mail/spamassassin/plugins/URLRedirect.pm
  
	urlredirect_max_recursion         2
	urlredirect_max_requests          20
	urlredirect_parallel_requests     10
	urlredirect_timeout               10
	urlredirect_cache                 yes

	urlredirect_dnslist               shorturl.junkemailfilter.com

	urlredirect_hostpath              tinyurl.com
	urlredirect_subdomain             notlong.com
	urlredirect_domain                fulpaziy.cn
	urlredirect_match                 (http://www.fulpaziy.cn/\?[a-z]+=[^&]+).*
	
	urlredirect_hostpath_file         URLRedirect.hostpath
	urlredirect_subdomain_file        URLRedirect.subdomain

	header    URL_REDIRECT            eval:urlredirect_redirect()
	header    URL_REDIRECTS           eval:urlredirect_redirects(3)
	header    URL_REDIRECTOR          eval:urlredirect_redirector()
	header    URL_REDIRECTORS         eval:urlredirect_redirectors(3)
	header    URL_REDIRECT_RECURSE    eval:urlredirect_recursion()
	header    URL_REDIRECT_RECURSE_2  eval:urlredirect_recursions(2)
	header    URL_REDIRECT_TOSAME     eval:urlredirect_tosamedomain()

	describe  URL_REDIRECT            URL redirection
	describe  URL_REDIRECTOR          URL redirector
	describe  URL_REDIRECTORS         Multiple URL redirectors
	describe  URL_REDIRECT_RECURSE    recursive URL redirection
	describe  URL_REDIRECT_RECURSE_2  recursive recursive URL redirection
	describe  URL_REDIRECT_TOSAME     URL redirects to same domain

	score     URL_REDIRECT            0.001
	score     URL_REDIRECTOR          0.001
	score     URL_REDIRECTORS         0.1
	score     URL_REDIRECT_RECURSE    0.5
	score     URL_REDIRECT_RECURSE_2  1.0
	score     URL_REDIRECT_TOSAME     0.25

=head1 DESCRIPTION

This module follows URLs (in parallel, using HEAD requests) matching specifications and adds the
location of redirections to metadata (so that the "real" sites are checked by
URIBLs and other rules).

=head1 REQUIREMENT

=over

=item *
SpamAssassin

=item *
LWP

=back

=head1 CONFIGURATION

=head2 Options

=over

=item urlredirect_max_recursion

The maximum times after the first check that the plugin will loop through the
URI metadata to follow URLs. Defaults to 0.

=item urlredirect_max_requests

The maximum number of HTTP requests do be done for one message.

=item urlredirect_parallel_requests

The maximum number of HTTP requests do be done in parallel.

=item urlredirect_timeout

The maximum time (in seconds) to spend checking on one message.

=item urlredirect_cache

Wether lookup and request results should be kept in memory between scans.
This is currently done in a per instance manner.

=item urlredirect_dnslist

The base name for a simple DNS based list of URL redirectors.

The list is assumed to return any A record for URL redirectors formated as
<url fqdn>.<dnsl base name>

=item urlredirect_*_file

Read redirector specifiers from a file, one specifier/line.

The files are searched for in the site config directory and the directory the
plugin resides in.

This is available for the stuff below as follows:

=over

=item *
urlredirect_domain

=item *
=item urlredirect_subdomain

=item *
=item urlredirect_host

=item *
=item urlredirect_hostpath

=item *
=item urlredirect_match

=back

=item urlredirect_domain

Defines a domain, the addresses of witch should be checked for redirects.

=item urlredirect_subdomain

Defines a domain for wich subdomains should be redirects.
Note: www.domain is excluded from this.

=item urlredirect_host

Defines a host address for wich redirects should be checked.

=item urlredirect_hostpath

Defines a host address for wich redirects should be checked if there is a
path following the host.

=item urlredirect_match

Defines a regular expression wich when matching a URL will trigger redirect
checking. The above options can be seen as shortcuts for this one.

Enclosing a part of the expression in () will result in the captured string
beeing used for the query.

=back

=head2 Rules

A couple of eval tests are provided for use in SA rules.

=item urlredirect_redirects(count) / urlredirect_redirect()

Returns true if count or more redirects occured.

Count deaults to 1.

=item urlredirect_recursions(depth) / urlredirect_recursion()

Returns true if any URL following recursed depth or more times.

Depth defaults to 1.

=item urlredirect_redirectors(count) / urlredirect_redirector()

Returns true if count or more redirector URLs was found.

Note that this does not mean that the URL redirected, only that it was
identified as a redirector by either configuration or a DNSL.

Count deaults to 1.

=item urlredirect_tosamedomain(count) / urlredirect_tosamedomain()

Returns true if count or more redirects redirected back to the same domain.

Count deaults to 1.

=head2 Notes

All defined domains/hosts/matches are concatenated into one regular expression.
By using urlredirect_match you can manually combine addresses into a more
compact expression than the one othwerwise created.

=head3 Equivalence examples:

	# Match for a host:
	urlredirect_host      example.com
	urlredirect_match     ^((?:example\.com)(?:[/?].*)?)$

	# Match for a host with subdir/file:
	urlredirect_hostpath  example.com
	urlredirect_match     ^((?:example\.com)[/?].+)$

	# Match for a domain:
	urlredirect_domain    example.com
	urlredirect_match     ((?:[^/?]+?\.)?(?:example\.com)(?:[/?].*)?)

	# Match for a domain with subdomains:
	urlredirect_domain    example.com
	urlredirect_match     ((?:(?!www)[^/?]{3}|[^/?]{1,2}|[^/?]{4,}?)\.(?:example\.com)(?:[/?].*)?)
	
=cut

package Mail::SpamAssassin::Plugin::URLRedirect;

# $Id: URLRedirect.pm,v 1.22 2009/11/16 18:57:55 jonas Exp $

use strict;
use base 'Mail::SpamAssassin::Plugin';
use File::Spec;
use Taint::Util;
use URI;
use IO::Select;
use Net::HTTP::NB;
use HTTP::Response;
use Mail::SpamAssassin;
use Mail::SpamAssassin::Util;
use Mail::SpamAssassin::Timeout;

sub new {
	my ($class,$mailsa) = @_;
	$class = ref($class) || $class;
	my $self = $class->SUPER::new($mailsa);
	bless($self,$class);
	$self->{domain} = [];
	$self->{subdomain} = [];
	$self->{host} = [];
	$self->{hostpath} = [];
	$self->{match} = [];
	$self->{dnsls} = [];
	$self->{timeout} = 10;
	$self->{cache} = 6*60*60;
	$self->{parallel_requests} = 10;
	$self->{max_requests} = 20;
	$self->{debug} = 1;
	$self->register_method_priority('parsed_metadata',-1);
	$self->register_eval_rule('urlredirect_redirect');
	$self->register_eval_rule('urlredirect_redirects');
	$self->register_eval_rule('urlredirect_redirection');
	$self->register_eval_rule('urlredirect_redirections');
	$self->register_eval_rule('urlredirect_redirector');
	$self->register_eval_rule('urlredirect_redirectors');
	$self->register_eval_rule('urlredirect_recursion');
	$self->register_eval_rule('urlredirect_recursions');
	$self->register_eval_rule('urlredirect_tosamedomain');
	$self->register_eval_rule('urlredirect_tosamedomains');
	$self->{plugin_path} = __FILE__;
	$self->{plugin_path} =~ s/[\\\/][^\\\/]+$//;
	$self->dbg('registered');
	return $self;
}

sub dbg {
	my $self = shift;
	#Mail::SpamAssassin::Plugin::dbg(sprintf("urlredirect dbg: %s",join('||',@_)));
	my $msg = shift;
	for (my $i=0;$i<@_;$i++) { $_[$i] = '' unless (defined($_[$i])); }
	if ($self->{debug}) {
		Mail::SpamAssassin::Plugin::info(sprintf("urlredirect: $msg",@_));
	} else {
		Mail::SpamAssassin::Plugin::dbg(sprintf("urlredirect: $msg",@_));
	}
	print STDERR sprintf("urlredirect: $msg\n",@_) if (defined($self) && $self->{stderr});
}

sub _from_files {
	my $self = shift;
	my $sing = shift;
	my $plur = shift;
	$plur = $sing.'s' unless ($plur);
	my @vals = ();
	while (@_) {
		my $file = shift @_;
		my $path = '';
		foreach my $dir (($self->{main}->{site_rules_filename},@Mail::SpamAssassin::site_rules_path,$self->{plugin_path})) {
			next unless ($dir);
			my $tst = File::Spec->rel2abs($file,$dir);
			next unless (-r $tst);
			$path = $tst;
			last;
		}
		unless ($path) {
			warn("Missing urlredirect '$plur' file: $file");
			next;
		}
		my $fh = undef;
		unless (open($fh,'<',$path)) {
			warn("Error reading urlredirect '$plur' file: $path");
			next;
		}
		$self->dbg('config file %s: "%s"',$plur,$path);
		while (my $l = <$fh>) {
			next if ($l =~ /^\s*[#;]/);
			$l =~ s/[\r\n]+//s;
			$l =~ s/^\s+//;
			$l =~ s/\s+$//;
			next unless ($l);
			$self->dbg('config file %s: %s',$sing,$l);
			push @vals, $l;
		}
		close($fh);
	}
	return @vals;
}

sub parse_config {
	my ($self,$pars) = @_;
	return 0 if ($pars->{user_config});
	return 0 unless ($pars->{key} =~ /^urlredirect_(.+)$/i);
	my $key = lc($1);
	my @val = split(/\s+/,$pars->{value});
	return 0 unless (@val);
	if ($key =~ /^max_?recursions?$/i) {
		$self->dbg('config: %s=%s','max_recursion',$val[0]);
		$self->{max_recursion} = $val[0];
	} elsif ($key =~ /^max_?requests?$/i) {
		$self->dbg('config: %s=%s','max_requests',$val[0]);
		$self->{max_requests} = $val[0];
	} elsif ($key =~ /^par(?:all?ell?)?_?requests?$/i) {
		$self->dbg('config: %s=%s','parallel_requests',$val[0]);
		$self->{parallel_requests} = $val[0];
	} elsif ($key =~ /^time-?out$/i) {
		$self->dbg('config: %s=%s','timeout',$val[0]);
		$self->{timeout} = $val[0];
	} elsif ($key =~ /^cache$/i) {
		$self->dbg('config: %s=%s','cache',$val[0]);
		$self->{cache} = ($val[0] && ($val[0] !~ /^\d+$/)) ? 6*60*60 : $val[0] * 60;
	} elsif ($key =~ /^domains?$/i) {
		$self->dbg('config: %s=%s','domain(s)',join(' ',@val));
		push @{$self->{domain}}, @val;
	} elsif ($key =~ /^domains?[-_]?files?$/i) {
		push @{$self->{domain}}, $self->_from_files('domain','',@val);
	} elsif ($key =~ /^subdomains?$/i) {
		$self->dbg('config: %s=%s','subdomain(s)',join(' ',@val));
		push @{$self->{subdomain}}, @val;
	} elsif ($key =~ /^subdomains?[-_]?files?$/i) {
		push @{$self->{subdomain}}, $self->_from_files('subdomain','',@val);
	} elsif ($key =~ /^hosts?$/i) {
		$self->dbg('config: %s=%s','host(s)',join(' ',@val));
		push @{$self->{host}}, @val;
	} elsif ($key =~ /^hosts?[-_]?files?/i) {
		push @{$self->{host}}, $self->_from_files('host','',@val);
	} elsif ($key =~ /^hosts?paths?$/i) {
		$self->dbg('config: %s=%s','host(s)path(s)',join(' ',@val));
		push @{$self->{hostpath}}, @val;
	} elsif ($key =~ /^hosts?paths?[-_]?files?/i) {
		push @{$self->{hostpath}}, $self->_from_files('hostpath','hostspaths',@val);
	} elsif ($key =~ /^matche?s?$/i) {
		$self->dbg('config: %s=%s','match(es)',join(' ',@val));
		push @{$self->{match}}, @val;
	} elsif ($key =~ /^matche?s?[-_]?files?$/i) {
		push @{$self->{match}}, $self->_from_files('match','matches',@val);
	} elsif ($key =~ /^dnsl(?:ist)?s?$/i) {
		$self->dbg('config: %s=%s','dnslist(s)',join(' ',@val));
		push @{$self->{dnsls}}, @val;
	} elsif ($key =~ /^clear$/i) {
		$self->dbg('config: %s=%s','clear',join(' ',@val));
		foreach my $cl (@val) {
			if ($cl =~ /^domains?$/i) {
				$self->{domain} = [];
			} elsif ($cl =~ /^subdomains?$/i) {
				$self->{subdomain} = [];
			} elsif ($cl =~ /^hosts?$/i) {
				$self->{host} = [];
			} elsif ($cl =~ /^hosts?paths?$/i) {
				$self->{hostpath} = [];
			} elsif ($cl =~ /^matche?s?$/i) {
				$self->{match} = [];
			} elsif ($cl =~ /^all$/i) {
				$self->{domain} = [];
				$self->{subdomain} = [];
				$self->{host} = [];
				$self->{hostpath} = [];
				$self->{match} = [];
			}
		}
	} else {
		return 0;
	}
	$self->inhibit_further_callbacks();
	return 1;
}

sub _clean_list {
	my ($self,$list) = @_;
	my %hash = ();
	foreach my $element (@{$list}) {
		$hash{$element} = 1 unless ($hash{$element});
	}
	return keys %hash;
}

sub finish_parsing_end {
	my ($self,$pars) = @_;
	$self->dbg('No redirectors!') unless ($self->_regex || @{$self->{dnsls}});
}

sub _regex {
	my ($self) = @_;
	return $self->{_regex} if (defined($self->{_regex}));
	my @rl = ();
	my @ml = ();
	my $rs;
	if (@{$self->{match}}) {
		$rs = join('|',$self->_clean_list($self->{match}));
		$rs =~ s/([<>])/\\$1/g;
		push @rl, sprintf('(?:%s)',$rs);
	}
	if (@{$self->{hostpath}}) {
		$rs = join('|',$self->_clean_list($self->{hostpath}));
		$rs =~ s/([.<>])/\\$1/g;
		push @ml, sprintf('(?:%s)[/?].+',$rs);
	}
	if (@{$self->{host}}) {
		$rs = join('|',$self->_clean_list($self->{host}));
		$rs =~ s/([.<>])/\\$1/g;
		push @ml, sprintf('(?:%s)(?:[/?].*)?',$rs);
	}
	if (@{$self->{subdomain}}) {
		$rs = join('|',$self->_clean_list($self->{subdomain}));
		$rs =~ s/([.<>])/\\$1/g;
		push @ml, sprintf('(?:(?!www)[^/?]{3}|[^/?]{1,2}|[^/?]{4,}?)\.(?:%s)(?:[/?].*)?',$rs);
	}
	if (@{$self->{domain}}) {
		$rs = join('|',$self->_clean_list($self->{domain}));
		$rs =~ s/([.<>])/\\$1/g;
		push @ml, sprintf('(?:[^/?]+?\.)?(?:%s)(?:[/?].*)?',$rs);
	}
	push @rl, sprintf('^(https?://(?:%s))$',join('|',@ml)) if (@ml);
	unless (@rl) {
		$self->{_regex} = 0;
		return 0;
	}
	$rs = join('|',@rl);
	$self->{_regex} = qr<$rs>; #<
	$self->dbg('regex: %s',$self->{_regex});
	return $self->{_regex};
}

sub compile_now_start {
	my ($self) = @_;
	$self->_regex();
}

sub _timegone {
	my ($self,$pms) = @_;
	my $lt = time() - $pms->{urlredirect_start};
	return ($lt >= 0) ? $lt : '-';
}

sub _timeleft {
	my ($self,$pms,$min) = @_;
	my $lt = ($pms->{urlredirect_start} + $self->{timeout}) - time();
	return ($min && $lt < $min) ? $min : $lt;
}

sub _timeout {
	my ($self,$pms) = @_;
	return 0 if ($self->_timeleft($pms) > 0);
	$self->dbg('normal timeout');
	return 1;
}

sub _handle_dns {
	my ($self,$reply,$id) = @_;
	$self->{dnsans} ++;
	return unless ($self->{dnscol} && (exists $self->{dnsids}) && (exists $self->{dnsads}));
	return unless (exists $self->{dnsids}->{$id});
	my $adr = $self->{dnsids}->{$id}->{adr};
	delete $self->{dnsids}->{$id};
	return unless (exists $self->{dnsads}->{$adr});
	foreach my $rr ($reply->answer) {
		next unless ($rr->type eq 'A');
		my $aa = $rr->address;
		$self->dbg('dns result: id=%s adr=%s ans=%s',$id,$adr,$aa);
		$self->{dnsads}->{$adr}->{res} = $aa;
		return;
	}
	return if (defined($self->{dnsads}->{$adr}->{res}));
	$self->{dnsads}->{$adr}->{res} = 0;
	$self->dbg('dns result: id=%s adr=%s ans=%s',$id,$adr,'nothing');
}

sub _query_dns {
	my ($self,$adr) = @_;
	$adr = lc($adr);
	return $self->{dnsads}->{$adr}->{id} if ($self->{dnsads}->{$adr});
	return 0 unless ($self->{follow});
	my $id = $self->{main}->{resolver}->bgsend($adr,'A',undef,sub { $self->_handle_dns(@_); });
	return 0 unless ($id);
	$self->{dnstot} ++;
	my $exp = time() + $self->{cache};
	$self->{dnsads}->{$adr} = {id=>$id,ts=>$exp};
	$self->{dnsids}->{$id} = {adr=>$adr,ts=>$exp};
	$self->dbg('dns query: adr=%s id=%s',$adr,$id);
	return $id;
}

sub _poll_http_responses {
	my ($self) = @_;
	my $rc = 0;
	foreach my $con (values %{$self->{urmcon}}) {
		next if (defined($con->{res}));
		next unless ($con->{con} && $con->{sel});
		my $ex = $con->{sel}->has_exception(0);
		my $cw = $con->{sel}->can_write(0);
		if ($con->{sel}->can_read(0)) {
			my ($code,$msg,@head);
			eval { ($code,$msg,@head) = $con->{con}->read_response_headers; };
			if ($code) {
				$self->dbg('http response: url=%s hrc=%s hrm=%s',$con->{url},$code,$msg);
				my $res;
				eval { $res = HTTP::Response->new($code,$msg,\@head) ; };
				if ($res && $res->is_redirect && $res->header('location')) {
					$con->{res} = $res->header('location');
				} else {
					$con->{res} = 0;
				}
			}
		} elsif ($ex || !$cw) {
			$con->{res} = 0;
		}
		next unless (defined($con->{res}));
		$self->dbg('http result: url=%s res=%s',$con->{url},$con->{res});
		eval { $con->{con}->close(); };
		delete $con->{con};
		delete $con->{sel};
		$self->{concur} --;
		$rc ++;
	}
	return $rc;
}

sub _parse_uri {
	my ($self,$url) = @_;
	my $uri; eval {	$uri = URI->new($url); };
	return undef unless ($uri);
	unless ($uri && $uri->scheme) {
		return undef unless ($url =~ /^[a-zA-Z0-9][-_.a-zA-Z0-9]*\.[a-zA-Z0-9]+(?:\/.*?)?$/);
		$uri = undef; eval { $uri = URI->new("http://$url"); };
	}
	return undef unless ($uri && $uri->scheme && lc($uri->scheme) eq 'http');
	my $ok; eval{ $ok = 1 if ($uri->host); };
	return undef unless ($ok);
	#$self->dbg('parsed url: url=%s sch=%s hst=%s prt=%s pth=%s',$url,$uri->scheme,$uri->host,$uri->port,$uri->path_query);
	return $uri;
}

sub _find_urls {
	my ($self,@lst) = @_;
	my @infs = ();
	foreach my $inf (@lst) {
		my $fnd;
		foreach my $url (@{$inf->{cleaned}}) {
			#$self->dbg('url: %s',$url);
			my $uri = $self->_parse_uri($url);
			next unless ($uri);
			unless ($fnd) {
				my $nam = "$inf";
				$nam =~ s/^HASH\((.*?)\)$/$1/;
				$fnd = {inf=>$inf,chk=>[],nam=>$nam};
				#$self->dbg('found inf: %s',$nam);
			}
			my $chk = {uri=>$uri,url=>$url,urm=>$url,rec=>0,cnt=>$#{$inf->{chk}}+2,state=>'m'};
			$self->dbg('found url: url=%s rec=%i cnt=%i',$url,0,$chk->{cnt});
			push @{$fnd->{chk}}, $chk;
		}
		next unless ($fnd);
		push @infs, $fnd;
	}
	return @infs;
}

sub _chk_fail {
	my ($self,$chk,$info) = @_;
	$self->dbg('chk fail: %s %s',$chk->{state},$info);
	return '?';
}

sub _chk_done {
	my ($self,$chk,$info) = @_;
	$self->dbg('chk done: %s %s',$chk->{state},$info);
	return '!';
}

sub _chk_dbg {
	my ($self,$inf,$chk,$state) = @_;
	return if ($chk->{state} eq $state);
	if ($chk->{state} eq 'l') {
		my @al = ();
		foreach my $id (@{$chk->{ids}}) {
			push @al, $id->{adr};
		}
		$self->dbg('chk state: %sL inf=%s chk=%i urm=%s lc=%i la=%s',$state,$inf,$chk->{cnt},$chk->{urm},scalar @al,join(' ',@al));
	} elsif ($chk->{state} eq 'f') {
		$self->dbg('chk state: %sF inf=%s chk=%i urm=%s',$state,$inf,$chk->{cnt},$chk->{urm});
	} elsif ($chk->{state} eq 'r') {
		$self->dbg('chk state: %sR inf=%s chk=%i urm=%s hst=%s',$state,$inf,$chk->{cnt},$chk->{urm},$chk->{hst});
	} elsif ($chk->{state} eq 's') {
		$self->dbg('chk state: %sS inf=%s chk=%i urm=%s adr=%s',$state,$inf,$chk->{cnt},$chk->{urm},$chk->{adr});
	} elsif ($chk->{state} eq 'c') {
		$self->dbg('chk state: %sC inf=%s chk=%i urm=%s adr=%s',$state,$inf,$chk->{cnt},$chk->{urm},$chk->{adr});
	} elsif ($chk->{state} eq 'h') {
		$self->dbg('chk state: %sH inf=%s chk=%i urm=%s loc=%s',$state,$inf,$chk->{cnt},$chk->{urm},($chk->{loc} && @{$chk->{loc}}) ? join(' ',@{$chk->{loc}}) : '-');
	} elsif ($chk->{state} eq '!') {
		$self->dbg('chk state: %sD inf=%s chk=%i urm=%s',$state,$inf,$chk->{cnt},$chk->{urm});
	} elsif ($chk->{state} eq '?') {
		my $msg = 'chk state: %sE inf=%s chk=%i urm=%s';
		my @msi = ($state,$inf,$chk->{cnt},$chk->{urm});
		my @al = ();
		foreach my $id (@{$chk->{ids}}) {
			push @al, $id->{adr};
		}
		if ($chk->{hst}) {
			$msg .= ' hst=%s';
			push @msi, $chk->{hst};
		}
		if (@al) {
			$msg .= ' lc=%i la=%s';
			push @msi, (scalar @al,join(' ',@al));
		}
		$self->dbg($msg,@msi);
	} else {
		$self->dbg('chk state: %s%s inf=%s chk=%i urm=%s',$state,uc($chk->{state}),$inf,$chk->{cnt},$chk->{urm});
	}
}

sub _chk_match {
	my ($self,$pms,$chk) = @_;
	return $self->_chk_done($chk,'no uri') unless ($chk->{uri});
	my $urd = Mail::SpamAssassin::Util::uri_to_domain($chk->{url});
	my $urh = $chk->{uri}->host;
	$urd = lc($urd) if ($urd);
	$urh = lc($urh) if ($urh);
	$chk->{urd} = $urd;
	$chk->{urh} = $urh;
	return $self->_chk_fail($chk,'no domain or host') unless ($urd && $urh);
	my $urp;
	if (my @urp = split(/\./,$urh)) {
		shift @urp;
		$urp = join('.',@urp);
		$urp = '' if ((length($urp) <= length($urd)) || $urp eq $urh);
	}
	my $re = $self->_regex;
	if ($re && ($chk->{url} =~ $re)) {
		my $urm = $+;
		$urm = $chk->{url} unless ($urm);
		my $uri = $self->_parse_uri($urm);
		if ($uri) {
			$chk->{urm} = $urm;
			$chk->{uri} = $uri;
			$self->dbg('matched url: url=%s urm=%s',$chk->{url},$urm);
			$pms->{urlredirect_redirectors} ++;
			return 'f';
		}
	}
	return $self->_chk_done($chk,'no dns') unless (@{$self->{dnsls}} && $pms->is_dns_available());
	my %ads = ();
	my %ids = ();
	my %did = ();
	my %idi = ();
	foreach my $urx (($urd,$urh,$urp)) {
		next unless ($urx);
		next if ($idi{$urx});
		$idi{$urx} = 1;
		foreach my $dnsl (@{$self->{dnsls}}) {
			my $adr = "$urx.$dnsl";
			unless ($self->{dnsads}->{$adr}) {
				$self->dbg('dns ad: %s',$adr);
				$ads{$adr} = 1;
				next;
			}
			unless (defined($self->{dnsads}->{$adr}->{res})) {
				$self->dbg('dns id: %s',$adr);
				$ids{$adr} = 1;
				next;
			}
			$did{$adr} = 1;
			my $red = $self->{dnsads}->{$adr}->{res};
			$self->dbg('dns result: url=%s urx=%s red=%s',$chk->{url},$urx,$red);
			next unless ($red);
			$pms->{urlredirect_redirectors} ++;
			return 'f';
		}
	}
	return $self->_chk_done($chk,'no dnsls or queries') unless (%ads || %ids);
	my @ids = ();
	foreach my $adr (keys %ids) {
		next if ($did{$adr});
		$did{$adr} = 1;
		push @ids, {adr=>$adr};
	}
	if ($self->{follow}) {
		foreach my $adr (keys %ads) {
			next if ($did{$adr});
			$did{$adr} = 1;
			my $id = $self->_query_dns($adr);
			push @ids, {id=>$id,adr=>$adr} if ($id);
		}
	}
	return $self->_chk_fail($chk,'no queries') unless (@ids);
	$chk->{ids} = \@ids;
	return 'l';
}

sub _chk_lookup {
	my ($self,$pms,$chk) = @_;
	return $self->_chk_fail($chk,'no queries') unless ($chk->{ids} && @{$chk->{ids}});
	$self->{main}->{resolver}->poll_responses(0);
	my $idl = 0;
	foreach my $ida (@{$chk->{ids}}) {
		unless (exists $self->{dnsads}->{$ida->{adr}}) {
			$self->dbg('dnsad gone: %s (%s)',$ida->{adr}?$ida->{adr}:'-',$ida->{id}?$ida->{id}:'-');
			next;
		}
		if (defined($self->{dnsads}->{$ida->{adr}}->{res})) {
			if ($self->{dnsads}->{$ida->{adr}}->{res}) {
				$pms->{urlredirect_redirectors} ++;
				return 'f';
			}
			next;
		}
		next if ($ida->{id} && !(exists $self->{dnsids}->{$ida->{id}}));
		$idl ++;
	}
	return $self->_chk_done($chk,'no queries') unless ($idl);
	return 'l';
}

sub _chk_follow {
	my ($self,$pms,$chk) = @_;
	return $self->_chk_fail($chk,'no url') unless ($chk->{urm});
	$pms->{urlredirect_recursions} = $chk->{rec} if ($chk->{rec} > $pms->{urlredirect_recursions});
	if ($self->{urmred}->{$chk->{urm}} && defined($self->{urmred}->{$chk->{urm}}->{loc})) {
		$chk->{loc} = $self->{urmred}->{$chk->{urm}}->{loc};
		$pms->{urlredirect_redirects} ++;
		return 'h';
	}
	my $hst = $chk->{uri}->host;
	$chk->{hst} = $hst;
	if ($hst =~ /^[\d:.]+$/) {
		$chk->{adr} = $hst;
		return 's';
	}
	if ($self->{dnsads}->{$hst} && defined($self->{dnsads}->{$hst}->{res})) {
		return $self->_chk_fail($chk,'no redirect') unless ($self->{dnsads}->{$hst}->{res});
		$chk->{adr} = $self->{dnsads}->{$hst}->{res};
		return 's';
	}
	return $self->_chk_fail($chk,'end') unless ($self->{follow});
	my $id = $self->_query_dns($hst);
	return $self->_chk_fail($chk,'no query') unless ($id);
	$chk->{rid} = $id;
	return 'r';
}

sub _chk_resolve {
	my ($self,$pms,$chk) = @_;
	return $self->_chk_fail($chk,'no query') unless ($chk->{rid});
	$self->{main}->{resolver}->poll_responses(0);
	if (defined($self->{dnsads}->{$chk->{hst}}->{res})) {
		return $self->_chk_fail($chk,'no redirect') unless ($self->{dnsads}->{$chk->{hst}}->{res});
		$chk->{adr} = $self->{dnsads}->{$chk->{hst}}->{res};
		return 's';
	}
	return $self->_chk_fail($chk,'query gone') unless (exists $self->{dnsids}->{$chk->{rid}});
	return 'r';
}

sub _chk_send {
	my ($self,$pms,$chk) = @_;
	return $self->_chk_fail($chk,'no address') unless ($chk->{adr} && $chk->{hst} && $chk->{uri});
	return 'c' if ($self->{urmcon}->{$chk->{urm}});
	return $self->_chk_fail($chk,'end') unless ($self->{follow});
	return $self->_chk_fail($chk,'max requests') if ($self->{max_requests} && $self->{contot} >= $self->{max_requests});
	my $host = $chk->{uri}->host;
	my $port = $chk->{uri}->port ? $chk->{uri}->port : 80;
	my $path = $chk->{uri}->path_query;
	$path = "/$path" unless ($path =~ m#^/#);
	my $prot = lc($chk->{uri}->scheme);
	return $self->_chk_fail($chk,'not http') unless ($prot eq 'http');
	return 's' if ($self->{parallel_requests} && $self->{concur} >= $self->{parallel_requests});
	my $url = "http://$host:$port$path";
	my $con;
	eval {
		my $addr = $chk->{adr};
		untaint($host);
		untaint($addr);
		untaint($port);
		$con = Net::HTTP::NB->new(
			Host		=> $host,
			PeerAddr	=> $addr,
			PeerPort	=> $port,
			Timeout		=> $self->_timeleft($pms,1),
		);
	};
	return $self->_chk_fail($chk,'no connection') unless ($con);
	my $sel = IO::Select->new(); $sel->add($con);
	my $ok; eval { $ok = $con->write_request('HEAD'=>$path,'Host'=>$host,'User-Agent'=>'URLRedirect.pm'); };
	unless ($ok) {
		$con->close();
		return $self->_chk_fail($chk,'bad connection');
	}
	$self->{concur} ++;
	$self->{contot} ++;
	$self->{concax} = $self->{concur} if ($self->{concur} > $self->{concax});
	$self->{urmcon}->{$chk->{urm}} = {con=>$con,sel=>$sel,url=>$url,ts=>time()+$self->{cache}};
	return 'c';
}

sub _chk_check {
	my ($self,$pms,$chk) = @_;
	return $self->_chk_done($chk,'no connection') unless ($chk->{urm} && $self->{urmcon}->{$chk->{urm}});
	$self->_poll_http_responses();
	return 'c' unless (defined($self->{urmcon}->{$chk->{urm}}->{res}));
	return $self->_chk_done($chk,'no redirect') unless ($self->{urmcon}->{$chk->{urm}}->{res});
	my @loc = Mail::SpamAssassin::Util::uri_list_canonify($self->{conf}->{redirector_patterns},$self->{urmcon}->{$chk->{urm}}->{res});
	$self->{urmred}->{$chk->{urm}} = {loc=>\@loc,ts=>time()+$self->{cache}};
	$chk->{loc} = \@loc;
	$pms->{urlredirect_redirects} ++;
	return 'h';
}

sub _chk_handle {
	my ($self,$pms,$chk,$inf) = @_;
	return $self->_chk_fail($chk,'no location') unless ($chk->{loc});
	push @{$inf->{inf}->{cleaned}}, @{$chk->{loc}};
	foreach my $can (@{$chk->{loc}}) {
		if (!$self->{max_recursion} || $chk->{rec} < $self->{max_recursion}) {
			my $uri = $self->_parse_uri($can);
			if ($uri) {
				my $chkn = {uri=>$uri,url=>$can,urm=>$can,rec=>$chk->{rec}+1,cnt=>$#{$inf->{chk}}+2,state=>'m'};
				$self->dbg('found url: cnt=%i urm=%s url=%s rec=%i cnt=%i',$chk->{cnt},$chk->{urm},$can,$chkn->{rec},$chkn->{cnt});
				push @{$inf->{chk}}, $chkn;
			}
		}
		my $dom = Mail::SpamAssassin::Util::uri_to_domain($can);
		next unless ($dom);
		$pms->{urlredirect_samedomains} ++ if ($dom eq $chk->{urd});
		next if ($inf->{inf}->{domains}->{$dom});
		$inf->{inf}->{domains}->{$dom} = 1;
	}
	return $self->_chk_done($chk,'finished');
}

sub _fix_cache {
	my ($self,$init) = @_;
	my $now = time();
	my $dc = 0;
	my $kc = 0;
	if ($self->{urmcon}) {
		foreach my $con (keys %{$self->{urmcon}}) {
			if ($self->{urmcon}->{$con}->{con}) {
				eval { $self->{urmcon}->{$con}->{con}->close(); };
				delete $self->{urmcon}->{$con}->{con};
			}
			next unless ($self->{cache});
			delete $self->{urmcon}->{$con}->{sel} if (exists $self->{urmcon}->{$con}->{sel});
			delete $self->{urmcon}->{$con}->{con} if (exists $self->{urmcon}->{$con}->{con});
			next if (exists $self->{urmcon}->{$con}->{res});
			$dc ++;
			delete $self->{urmcon}->{$con};
		}
	}
	foreach my $cache (('urmcon','urmred','dnsids','dnsads')) {
		if ($self->{$cache}) {
			if ($self->{cache}) {
				foreach my $k (keys %{$self->{$cache}}) {
					if ($self->{$cache}->{$k}->{ts} && $self->{$cache}->{$k}->{ts} >= $now) {
						$kc ++;
					} else {
						$dc ++;
						delete $self->{$cache}->{$k};
					}
				}
			} elsif ($init) {
				%{$self->{$cache}} = ();
			} else {
				$dc += scalar keys %{$self->{$cache}};
				delete $self->{$cache};
			}
		} elsif ($init) {
			$self->{$cache} = {}
		}
	}
	$self->dbg('cache: %i %i',$kc,$dc) if ($kc || $dc);
}

sub _chk_work {
	my ($self,$pms,$chk,$inf) = @_;
	my $did = 0;
	if ($chk->{state} eq 'm') {
		$chk->{state} = $self->_chk_match($pms,$chk);
		$self->_chk_dbg($inf->{nam},$chk,'m');
		$did ++ if ($chk->{state} ne '?');
		return $did unless ($self->{cloops})
	}
	if ($chk->{state} eq 'l') {
		$chk->{state} = $self->_chk_lookup($pms,$chk);
		$self->_chk_dbg($inf->{nam},$chk,'l');
		$did ++ if ($chk->{state} eq 'f');
		return $did unless ($self->{cloops})
	}
	if ($chk->{state} eq 'f') {
		$chk->{state} = $self->_chk_follow($pms,$chk);
		$self->_chk_dbg($inf->{nam},$chk,'f');
		$did ++ if ($chk->{state} eq 'r');
		return $did unless ($self->{cloops})
	}
	if ($chk->{state} eq 'r') {
		$chk->{state} = $self->_chk_resolve($pms,$chk);
		$self->_chk_dbg($inf->{nam},$chk,'r');
		return $did unless ($self->{cloops})
	}
	if ($chk->{state} eq 's') {
		$chk->{state} = $self->_chk_send($pms,$chk);
		$self->_chk_dbg($inf->{nam},$chk,'s');
		$did ++ if ($chk->{state} eq 'c');
		return $did unless ($self->{cloops})
	}
	if ($chk->{state} eq 'c') {
		$chk->{state} = $self->_chk_check($pms,$chk);
		$self->_chk_dbg($inf->{nam},$chk,'c');
		$did ++ if ($chk->{state} eq 'h');
		return $did unless ($self->{cloops})
	}
	if ($chk->{state} eq 'h') {
		$chk->{state} = $self->_chk_handle($pms,$chk,$inf);
		$self->_chk_dbg($inf->{nam},$chk,'h');
		$did ++ if ($chk->{state} eq '!');
		return $did unless ($self->{cloops})
	}
	return $did;
}

sub parsed_metadata {
	my ($self,$pars) = @_;
	my $pms = $pars->{permsgstatus};
	$pms->{urlredirect_start} = time();
	return 1 unless ($self->_regex || (@{$self->{dnsls}} && $pms->is_dns_available()));
	my $lst = $pms->get_uri_detail_list;
	return 1 unless ($lst && %{$lst});
	$self->_fix_cache(1);
	$pms->{urlredirect_recursions} = -1;
	$pms->{urlredirect_samedomains} = 0;
	$pms->{urlredirect_redirectors} = 0;
	$pms->{urlredirect_redirects} = 0;
	$self->{concur} = 0;
	$self->{concax} = 0;
	$self->{contot} = 0;
	$self->{dnstot} = 0;
	$self->{dnsans} = 0;
	$self->{follow} = 1;
	$self->{dnscol} = 1;
	$self->{cloops} = 0;
	my $to = Mail::SpamAssassin::Timeout->new({secs=>$self->{timeout}+5});
	my @infs = ();
	$to->run_and_catch(sub {
		@infs = $self->_find_urls(values %{$lst});
		if (@infs) {
			while (!$self->_timeout($pms)) {
				$self->{cloops} ++;
				my $did = 0;
				my $left = 0;
				foreach my $inf (@infs) {
					foreach my $chk (@{$inf->{chk}}) {
						next if ($chk->{state} eq '!' || $chk->{state} eq '?');
						$did += $self->_chk_work($pms,$chk,$inf);
						$left ++ unless ($chk->{state} eq '!' || $chk->{state} eq '?');
					}
					last if ($self->_timeout($pms));
				}
				#$self->{cloops} ++;
				last unless ($left);
				last if ($self->_timeout($pms));
				next if ($did);
				#$self->dbg('sleep');
				select(undef,undef,undef,0.25);
			}
		}
	});
	my $err = $@;
	$self->dbg('forced timeout') if ($to->timed_out);
	$self->{follow} = 0;
	$self->{dnscol} = 0 unless ($self->{cache});
	if (@infs) {
		$self->{cloops} ++;
		foreach my $inf (@infs) {
			foreach my $chk (@{$inf->{chk}}) {
				next if ($chk->{state} eq '!' || $chk->{state} eq '?');
				$self->_chk_work($pms,$chk,$inf);
				next if ($chk->{state} eq '!' || $chk->{state} eq '?');
				my $state = $chk->{state};
				$chk->{state} = '?';
				$self->_chk_dbg($inf->{nam},$chk,$state);
			}
		}
	}
	$self->dbg('connections: tot=%u par=%u dns=%i ans=%i',delete $self->{contot},delete $self->{concax},$self->{dnstot},$self->{dnsans});
	$self->_fix_cache();
	$self->dbg('stat: %s %i',$self->_timegone($pms),$self->{cloops});
	die($err) if ($err);
	return 1;
}

sub _eval_rule_check {
	my ($self,$pms,$value,$check) = @_;
	return 0 unless (defined($pms->{$value}));
	$check = 1 unless (defined($check) && $check ne '');
	unless ($check =~ /^\d+$/) {
		warn("Bad value in 'eval:$value($check)'");
		return 0;
	}
	my $r = ($pms->{$value} >= $check ) ? 1 : 0;
	$self->dbg('%s %i >= %i -> %i',$value,$pms->{$value},$check,$r);
	return $r;
}

sub urlredirect_recursions {
	my ($self,$pms,$check) = @_;
	return $self->_eval_rule_check($pms,'urlredirect_recursions',$check);
}
*urlredirect_recursion = \&urlredirect_recursions;

sub urlredirect_redirectors {
	my ($self,$pms,$check) = @_;
	return $self->_eval_rule_check($pms,'urlredirect_redirectors',$check);
}
*urlredirect_redirector = \&urlredirect_redirectors;

sub urlredirect_redirects {
	my ($self,$pms,$check) = @_;
	return $self->_eval_rule_check($pms,'urlredirect_redirects',$check);
}
*urlredirect_redirect = \&urlredirect_redirects;
*urlredirect_redirection = \&urlredirect_redirects;
*urlredirect_redirections = \&urlredirect_redirects;

sub urlredirect_tosamedomains {
	my ($self,$pms,$check) = @_;
	return $self->_eval_rule_check($pms,'urlredirect_samedomains',$check);
}
*urlredirect_tosamedomain = \&urlredirect_tosamedomains;

1;

(2009-07-10)