#!/usr/bin/perl -w

# sa-effective.pl
# Author: Mike Jackson
#
# This script is designed to help you find which SpamAssassin rules are most
# effective at tagging messages as spam...or at least rules other than the
# automatic whitelisting or other mechanisms with dynamic scores (since no
# scores are published in the rulesets).  Here's how it works.  It looks
# through your existing mail logs for spamd entries.  It grabs the required
# hits for that message, its final score, and the rule(s) that it matched.  It
# then subtracts that rule's score from the total hits.  If the result is lower
# than the required hits, then it deems that that rule pushed the message over
# the required hits threshold.  Keep in mind that it could count multiple rules
# per message; I thought that was only fair.

use strict;
use warnings;
use File::Find;

# Run an external command WITHOUT going through the shell and return its
# output lines (newlines still attached).  Arguments are passed to the program
# verbatim, so file names containing spaces or shell metacharacters are safe.
# Returns an empty list if the command could not be started.
sub capture_lines {
    my @command = @_;

    open(my $pipe, '-|', @command) or return;
    my @output = <$pipe>;

    # The exit status is deliberately ignored: grep exits non-zero when it
    # simply finds no matching lines, which is not an error for us.
    close($pipe);

    return @output;
}

# Parse one line of a SpamAssassin config file.  Returns (rule name, score) if
# the line is a "score" directive, or an empty list otherwise.
#
# SpamAssassin rule definitions can either have a single score, or a set of
# four scores, with the correct one selected depending on if Bayes and/or
# DNS-based tests are enabled.  When all four are available, I use the score
# that assumes Bayes and DNS tests are enabled (the fourth one).
sub parse_score_line {
    my ($line) = @_;

    # Anchored at the start of the line so that commented-out directives
    # ("# score FOO 1.0") and the word "score" inside other directives are
    # not mistaken for real score lines.
    return unless $line =~ /^\s*score\s+(\S+)\s+(-?[0-9.]+)(?:\s+(-?[0-9.]+)\s+(-?[0-9.]+)\s+(-?[0-9.]+))?/;

    return ($1, defined $5 ? $5 : $2);
}

# The message below promises "3.0 and up", so compare the major version
# numerically instead of matching a literal "3" (which would reject 4.x).
my $version = `spamassassin -V`;
unless (defined $version && $version =~ /version\s+(\d+)/ && $1 >= 3) {
    die "This script requires SpamAssassin version 3.0 and up.\n";
}

# Look through likely ruleset locations for rule files that contain score
# lines, then save the scores.  Note that the path list is in a specific order,
# as config files in the latter directories can override scores from the
# previous files.
my %rulescore;    # rule name => score used for the comparison
my %rulefile;     # rule name => config file the score was last read from

foreach my $dir (qw(/usr/share/spamassassin /usr/local/share/spamassassin /var/lib/spamassassin /etc/mail/spamassassin)) {
    next unless -d $dir;

    # Collect every regular *.cf file below $dir.
    my @cf_files;
    {
        # Silence File::Find's own warnings (e.g. unreadable subdirectories);
        # such directories are simply skipped.
        no warnings 'File::Find';
        find(
            sub {
                push @cf_files, $File::Find::name
                    if /\.cf\z/ && -f $_ && !-l $_;
            },
            $dir
        );
    }

    # Sorted so that which file "wins" for a given rule does not depend on
    # the order the filesystem happens to return directory entries in.
    foreach my $file (sort @cf_files) {
        open(my $fh, '<', $file) or next;
        while (my $line = <$fh>) {
            chomp $line;
            my ($rule, $score) = parse_score_line($line);
            next unless defined $rule;
            $rulescore{$rule} = $score;
            $rulefile{$rule}  = $file;
        }
        close($fh);
    }
}

# I work for a hosting provider that often installs Plesk on customer servers.
# It stores its logs in a different location, so to make it easier for my
# fellow techs to use this script, we'll override the log source if necessary.
my $logdir = '/var/log';
if (-d '/usr/local/psa/var/log') {
    $logdir = '/usr/local/psa/var/log';
}

my %rulediff;    # rule name => number of messages it pushed over the threshold

opendir(my $dh, $logdir) or die "could not open $logdir\: $!\n";
while (defined(my $file = readdir($dh))) {
    next unless $file =~ /^maillog/;

    my $path = "$logdir/$file";
    next unless -f $path;

    # I realize it's not normal for rotated logfiles to be in any format other
    # than gzip, but I come from a FreeBSD background and bzip2 is one of the
    # log rotation options...and the one that I happen to use on my BSD boxes.
    # So, we test for the format and use the correct grep permutation.
    my ($type) = capture_lines('file', $path);
    $type = '' unless defined $type;

    my $searchcmd = 'grep';
    if ($type =~ /bzip2 compressed/i) {
        $searchcmd = 'bzgrep';
    }
    elsif ($type =~ /gzip compressed/i) {
        $searchcmd = 'zgrep';
    }

    foreach my $line (capture_lines($searchcmd, 'spamd: result: Y', $path)) {
        chomp $line;
        next unless $line =~ /result: Y (-?\d+) - (\S+) .+required_score=(-?[0-9.]+)/;
        my ($score, $matched, $required) = ($1, $2, $3);

        foreach my $rule (split /,/, $matched) {
            # Rules without a published score (dynamic scores such as the
            # auto-whitelist) cannot be evaluated.
            next unless defined($rulescore{$rule});

            # Without this rule the message would have stayed under the
            # threshold, so credit the rule.
            if (($score - $rulescore{$rule}) < $required) {
                $rulediff{$rule}++;
            }
        }
    }
}
closedir($dh);

# Most effective rules first; ties are broken by rule name so the report is
# stable from run to run.
foreach my $rule (sort { $rulediff{$b} <=> $rulediff{$a} or $a cmp $b } keys %rulediff) {
    print "Rule: $rule\n";
    print "Count: $rulediff{$rule}\n";
    print "Score file: $rulefile{$rule}\n\n";
}