#!/usr/bin/perl -W
#############################################################################
# ndb2cf.pl - A script for converting SaneSecurity *.ndb signature files to
#             SpamAssassin *.cf rule files.
#
# Copyright 2008 by Robert LeBlanc
#
# Typical usage:
#
#     ndb2cf.pl --ndb-file phish.ndb --cf-file 80_SS_Phish.cf
#
#   or
#
#     cat phish.ndb | ndb2cf.pl --cf-file 80_SS_Phish.cf
#
#   or
#
#     cat phish.ndb | ndb2cf.pl > 80_SS_Phish.cf
#
# Additional options:
#
#   --boilerplate    Prepends boilerplate comment text (e.g. a
#                    license/credits document, etc.) to the
#                    rule file output. Comment markers will be
#                    inserted as necessary.
#
#   --score          Assigns a score to all rules. The default
#                    value is 0.001, suitable for testing.
#
#   --rule-prefix    Assigns a prefix to rule names, for easier
#                    identification. The default prefix is 'SS'
#                    (for SaneSecurity).
#
#   --include-sigs   Includes the SaneSecurity ClamAV signatures
#                    in the comments, mainly for debugging
#                    purposes and verifying the converted regular
#                    expression patterns.
#
#   --verbose        Writes timestamped output to STDOUT, suitable
#                    for logging purposes and running the script
#                    from cron.
#
# Note that this script is not designed to convert general ClamAV
# signature files to SpamAssassin rules. It has been specifically
# designed to convert the phish.ndb and scam.ndb signature files
# published by SaneSecurity.
#
# Note also that there are more than 20,000 rules between these two
# signature files, and as such they will occupy a large amount of memory
# and will probably slow SpamAssassin considerably. Use of the
# sa-compile utility is strongly recommended to optimize these regular
# expressions.
#
# This script is provided as-is, without warranty of any kind--use it and
# the rules it generates at your own risk.
#############################################################################

use strict;
use warnings;
use Getopt::Long;
use Class::Struct;
use POSIX qw(strftime);

# One ClamAV signature together with the SpamAssassin rule derived from it.
# The first four members are filled in by read_ndb_file(); the rule_*
# members are filled in by convert_signatures_to_rules().
struct Signature => {
    sig_name  => '$',    # ClamAV signature name
    type      => '$',    # ClamAV target type (always 0, 3 or 4)
    offset    => '$',    # ClamAV offset (always '*', not used)
    hexsig    => '$',    # ClamAV hex signature
    rule_name => '$',    # SpamAssassin rule name
    rule_type => '$',    # SpamAssassin rule type
    rule_desc => '$',    # SpamAssassin rule description
    rule_pat  => '$',    # SpamAssassin rule pattern
};

# prototypes
sub assign_rule_description($);
sub assign_rule_name($);
sub assign_rule_pattern($);
sub assign_rule_type($);
sub convert_signatures_to_rules($);
sub escape_regex_metachars($);
sub fatal($);
sub output($);
sub read_ndb_file($$);
sub write_cf_file($$);

# name of this script for logging purposes
my $script_name = "ndb2cf";

# defaults (overridden by command line options)
my $ndb_file     = "stdin";
my $cf_file      = "stdout";
my $rule_prefix  = "SS";
my $boilerplate  = "";
my $score        = 0.001;
my $verbose      = 0;
my $include_sigs = 0;
my $help         = 0;

my @Signatures = ();

# Specific descriptions for rules by signature category.  Each 'pat' is
# used as a case-insensitive regular expression against the signature
# name, so an unescaped '.' matches any single character.
# Source: http://www.sanesecurity.co.uk/clamav/docs.htm
my @specific_descriptions = (
    {pat => '.bou.gen',                           desc => "scam"},
    {pat => '.cred.gen',                          desc => "credit scam"},
    {pat => '.dipl.gen',                          desc => "diploma scam"},
    {pat => '.doc.gen',                           desc => "phishing attempt"},
    {pat => '.hdr.sanesecurity.',                 desc => "header signature"},
    {pat => '.img.gen',                           desc => "image spam"},
    {pat => '.imgo.gen',                          desc => "OEM image spam"},
    {pat => '.job.gen',                           desc => "job scam"},
    {pat => '.loan.gen',                          desc => "loan scam"},
    {pat => '.malware.',                          desc => "links to malware"},
    {pat => '.phishing.auction.',                 desc => "eBay phish"},
    {pat => '.phishing.azon.',                    desc => "Amazon phish"},
    {pat => '.phishing.bank.',                    desc => "bank phish"},
    {pat => '.phishing.card.',                    desc => "credit card phish"},
    {pat => '.phishing.cur.gen',                  desc => "phishing attempt"},
    {pat => '.phishing.dca.gen',                  desc => "DoubleClick revenue link"},
    {pat => '.phishing.fake.',                    desc => "phishing attempt"},
    {pat => '.phishing.gens.',                    desc => "phishing attempt"},
    {pat => '.phishing.giftcard.',                desc => "gift card phish"},
    {pat => '.phishing.hex.gen',                  desc => "links contain hex"},
    {pat => '.phishing.ivt.gen',                  desc => "invalid tags"},
    {pat => '.phishing.jsc.gen',                  desc => "phishing attempt"},
    {pat => '.phishing.nam.gen',                  desc => "common fake HTML editor"},
    {pat => '.phishing.onf.gen',                  desc => "phishing attempt"},
    {pat => '.phishing.pay.',                     desc => "PayPal phish"},
    {pat => '.phishing.rdi.gen',                  desc => "redirect"},
    {pat => '.phishing.rock.',                    desc => "phishing attempt"},
    {pat => '.phishing.rockgen',                  desc => "phishing attempt"},
    {pat => '.phishing.sanesecurity.testsig$',    desc => "phishing test signature"},
    {pat => '.phishing.shop.',                    desc => "merchant phish"},
    {pat => '.phishing.slw.gen',                  desc => "phishing attempt"},
    {pat => '.phishing.url.',                     desc => "file with phishing URL"},
    {pat => '.phishing.wrd.gen',                  desc => "phishing attempt"},
    {pat => '.porn.gen',                          desc => "porn scam"},
    {pat => '.sanesecurity.testsig_type[34]_bdy', desc => "phishing test signature"},
    {pat => '.sanesecurity.testsig_type4_hdr',    desc => "phishing test signature"},
    {pat => '.scam.gen',                          desc => "scam"},
    {pat => '.scam.sanesecurity.testsig',         desc => "scam test signature"},
    {pat => '.scam4.gen',                         desc => "419 scam"},
    {pat => '.scaml.gen',                         desc => "lottery scam"},
    {pat => '.spam.gen',                          desc => "spam"},
    {pat => '.spam.sanesecurity.url',             desc => "file with a blacklisted URL"},
    {pat => '.stk.gen',                           desc => "stock scam"},
);

# General descriptions for rules by signature category
my @general_descriptions = (
    {pat => '.phishing.', desc => "phishing attempt"},
    {pat => '.scam.',     desc => "scam"},
    {pat => '.spam.',     desc => "spam"},
);

# Parse the command line for options and overrides.  The underscore
# spellings of --ndb-file/--cf-file are accepted as aliases.  An
# unrecognised or malformed option aborts the run instead of silently
# continuing with the defaults.
GetOptions(
    "ndb-file|ndb_file=s" => \$ndb_file,
    "cf-file|cf_file=s"   => \$cf_file,
    "score=f"             => \$score,
    "rule-prefix=s"       => \$rule_prefix,
    "boilerplate=s"       => \$boilerplate,
    "include-sigs"        => \$include_sigs,
    "verbose"             => \$verbose,
    "help"                => \$help,
) or fatal("Invalid command line options; use --help for usage.");

if ($help) {
    output("Usage: ndb2cf.pl\n" .
           " --ndb-file file : read input from a ClamAV *.ndb file (default: STDIN)\n" .
           " --cf-file file : write output to a SpamAssassin *.cf file (default: STDOUT)\n" .
           " --score value : assign this score to all rules (default: 0.001)\n" .
           " --rule-prefix prefix : prefix rule names with a character string (default: 'SS')\n" .
           " --boilerplate file : prepend boilerplate text from a file (default: none)\n" .
           " --include-sigs : include ClamAV hex signatures in comments (default: off)\n" .
           " --verbose : verbose output\n" .
           " --help : display this help text");
    exit;
}

# Make sure the rule prefix is in upper case
$rule_prefix = uc($rule_prefix);

output("Starting.") if ($verbose);

my $signatures_read = read_ndb_file($ndb_file, \@Signatures);
output("$signatures_read signatures read.") if ($verbose);

if ($signatures_read > 0) {
    output("Assigning all rules a score of $score.") if ($verbose);
    output("Prefixing all rules with '$rule_prefix'.") if ($verbose);
    convert_signatures_to_rules(\@Signatures);
    my $rules_written = write_cf_file($cf_file, \@Signatures);
    output("$rules_written rules written.") if ($verbose);
}
else {
    fatal("No signatures read!");
}

output("Shutting down.") if ($verbose);
exit;

########################################################################
###           End of Script:  Subroutines begin below               ###
########################################################################

# Die, printing a time-stamped error message.
#
# The message is written through output() (i.e. to STDOUT) and the
# process then exits with status 1; this sub never returns.
sub fatal($) {
    my ($msg) = @_;
    output("FATAL ERROR: " . $msg);
    exit 1;
}

# Write a time-stamped string to stdout for logging purposes.
sub output($) {
    my ($msg) = @_;

    # Local time as "YYYY-MM-DD HH:MM:SS", followed by the script name
    # in brackets and the message itself.
    my $stamp = POSIX::strftime("%Y-%m-%d %H:%M:%S", localtime);
    printf("%s [%s] %s\n", $stamp, $script_name, $msg);
}

# Parse a SaneSecurity *.ndb signature file
#
# Arguments:
#   $ndb_file       - path of the file to read, or the literal string
#                     "stdin" to read from STDIN
#   $Signatures_ref - reference to the array that receives one Signature
#                     object per line read
#
# Each line has the colon-separated form name:type:offset:hexsig.
# Blank lines and lines without a hex signature are skipped, since they
# cannot be turned into a rule.
#
# Returns the number of elements in @$Signatures_ref.  Calls fatal() if
# the input file cannot be opened.
sub read_ndb_file($$) {
    my ($ndb_file, $Signatures_ref) = @_;

    my $fh;
    if ($ndb_file ne "stdin") {
        # Three-argument open: the file name is never parsed for a mode.
        open($fh, '<', $ndb_file)
            or fatal("Can't read input file $ndb_file: $!");
        output("Reading signatures from $ndb_file.") if ($verbose);
    }
    else {
        $fh = \*STDIN;
        output("Reading signatures from STDIN.") if ($verbose);
    }

    while (defined(my $line = <$fh>)) {
        # Strip the line ending, including a CR left by DOS-style files.
        $line =~ s/[\r\n]+\z//;
        next if ($line =~ /^\s*$/);

        my @field = split(/:/, $line);
        if (@field < 4 || $field[3] eq "") {
            output("Skipping malformed signature on line $.: $line")
                if ($verbose);
            next;
        }

        my $sig = Signature->new(
            sig_name  => $field[0],
            type      => $field[1],
            offset    => $field[2],
            hexsig    => $field[3],
            rule_name => "",
            rule_desc => "",
            rule_type => "",
            rule_pat  => "",
        );
        push(@$Signatures_ref, $sig);
    }

    close($fh) if ($ndb_file ne "stdin");

    return scalar(@$Signatures_ref);
}

# Write a SpamAssassin *.cf rule file
#
# Arguments:
#   $cf_file        - path of the file to write, or the literal string
#                     "stdout" to write to STDOUT
#   $Signatures_ref - reference to an array of converted Signature objects
#
# The output consists of the optional boilerplate text (from the file
# named by --boilerplate), a short generated header, and then one
# rule/describe/score triple per signature.
#
# Returns the number of rules written.  Calls fatal() if a file cannot
# be opened or the output file cannot be closed cleanly.
sub write_cf_file($$) {
    my ($cf_file, $Signatures_ref) = @_;

    my $fh;
    if ($cf_file ne "stdout") {
        open($fh, '>', $cf_file)
            or fatal("Can't write output file $cf_file: $!");
        output("Writing rules to $cf_file.") if ($verbose);
    }
    else {
        $fh = \*STDOUT;
        output("Writing rules to STDOUT.") if ($verbose);
    }

    # Prepend any supplied boilerplate text, inserting
    # comment markers at the beginning of each line as
    # necessary.
    if ($boilerplate ne "") {
        open(my $bp_fh, '<', $boilerplate)
            or fatal("Can't read boilerplate file $boilerplate: $!");
        while (defined(my $line = <$bp_fh>)) {
            chomp $line;
            # print, not printf: boilerplate text is data and must not be
            # interpreted as a format string (it may contain '%').
            if ($line =~ /^\s*\#(.*)$/) {
                print {$fh} "#" . $1 . "\n";
            }
            else {
                print {$fh} "# " . $line . "\n";
            }
        }
        close($bp_fh);
        print {$fh} "\n";
    }

    # Filename
    if ($cf_file ne "stdout") {
        print {$fh} "# File: $cf_file\n";
    }

    # Timestamp
    my $now = POSIX::strftime("%Y-%m-%d %H:%M:%S", gmtime);
    print {$fh} "# Auto-generated: $now UTC\n";

    # Rule count
    my $rule_count = scalar(@$Signatures_ref);
    print {$fh} "# Rule count: $rule_count\n";
    print {$fh} "\n";

    my $rules_written = 0;
    foreach my $sig (@$Signatures_ref) {
        printf {$fh} "# %s\n", $sig->sig_name;
        printf {$fh} "# ClamAV signature: %s\n", $sig->hexsig
            if ($include_sigs);
        printf {$fh} "%-8s %s %s\n",
            $sig->rule_type, $sig->rule_name, $sig->rule_pat;
        printf {$fh} "describe %s %s\n", $sig->rule_name, $sig->rule_desc;
        printf {$fh} "score %s %0.3f\n\n", $sig->rule_name, $score;
        $rules_written++;
    }

    # Buffered write errors (e.g. a full disk) only surface at close.
    if ($cf_file ne "stdout") {
        close($fh)
            or fatal("Can't close output file $cf_file: $!");
    }

    return $rules_written;
}

# Construct a valid SpamAssassin rule name based on a SaneSecurity signature name
#
#   * Names must be all-caps
#   * Names can contain only A-Z, 0-9, and the underscore
#   * Names should start with an author identifier, in this case "SS_"
#   * Names should (ideally) be no longer than 22 characters
#
# The signature name is split on '.', and each of the first seven fields
# contributes an (often abbreviated) upper-case component.  Any character
# outside A-Z, 0-9 and '_' that survives is replaced by '_' so that the
# second rule above always holds.  The length guideline is not enforced.
sub assign_rule_name($) {
    my ($sig_name) = @_;

    my $rule_name = $rule_prefix;
    my @field     = split(/\./, $sig_name);

    # Field 1: Email or HTML -- contributes nothing to the rule name.

    # Field 2: Doc, Malware, Phishing, Bou, Cred, Dipl, Hdr, Img, ImgO,
    #          Job, Loan, Porn, Scam, Scam4, ScamL, Spam, Stk
    if (defined($field[1])) {
        my %fixed = (
            sanesecurity => "",
            malware      => "_MALWR",
            phishing     => "_PHISH",
        );
        my $part = lc($field[1]);
        $rule_name .= exists($fixed{$part}) ? $fixed{$part}
                                            : "_" . uc($field[1]);
    }

    # Field 3: Auction, Azon, Bank, Card, Cur, Dca, Fake, Gen*
    #          GiftCard, Hex, Ivt, Jsc, Nam, Onf, Pay, Rdi, Rock
    #          RockGen*, Shop, TestSig_Type3_Bdy, TestSig_Type4_Bdy
    #          TestSig_Type4_Hdr, Url, Wrd
    if (defined($field[2])) {
        my %fixed = (
            sanesecurity      => "",
            testsig_type3_bdy => "_TEST3B",
            testsig_type4_bdy => "_TEST4B",
            testsig_type4_hdr => "_TEST4H",
            giftcard          => "_GIFT",
            auction           => "_AUCT",
        );
        my $part = lc($field[2]);
        if (exists($fixed{$part})) {
            $rule_name .= $fixed{$part};
        }
        elsif ($part =~ /^gen(.*)$/) {
            $rule_name .= "_G" . uc($1);
        }
        elsif ($part =~ /^rockgen(.*)$/) {
            $rule_name .= "_RG" . uc($1);
        }
        else {
            $rule_name .= "_" . uc($field[2]);
        }
    }

    # Field 4: (date), Gen*, Rockv2Gen*, TestSig, Url_*, Web
    if (defined($field[3])) {
        my $part = lc($field[3]);
        if ($part eq 'sanesecurity') {
            # do nothing
        }
        elsif ($part eq 'testsig') {
            $rule_name .= "_TEST";
        }
        elsif ($part =~ /^gen(.*)$/) {
            $rule_name .= "_G" . uc($1);
        }
        elsif ($part =~ /^rockv2gen(.*)$/) {
            $rule_name .= "_RGV2" . uc($1);
        }
        elsif ($part =~ /^url_(\d+)$/) {
            $rule_name .= "_URL" . $1;
        }
        else {
            $rule_name .= "_" . uc($field[3]);
        }
    }

    # Field 5:
    if (defined($field[4])) {
        my %fixed = (
            sanesecurity => "",
            bobaxspam    => "_BOBAX",
            wmascript    => "_WMA",
        );
        my $part = lc($field[4]);
        $rule_name .= exists($fixed{$part}) ? $fixed{$part}
                                            : "_" . uc($field[4]);
    }

    # Field 6:
    if (defined($field[5])) {
        if (lc($field[5]) =~ /^sqlinj_(\d+)$/) {
            $rule_name .= "_SQLINJ" . $1;
        }
        else {
            $rule_name .= "_" . uc($field[5]);
        }
    }

    # Field 7:
    if (defined($field[6])) {
        $rule_name .= "_" . uc($field[6]);
    }

    # Enforce the character set promised above: anything that is not
    # A-Z, 0-9 or '_' (e.g. a '-' inside a field) becomes '_'.
    $rule_name =~ tr/A-Z0-9_/_/c;

    return $rule_name;
}

# Determine the SpamAssassin rule type to use for a SaneSecurity signature
#
# ClamAV signature types:
#   0 = Any file
#   1 = Portable executable
#   2 = OLE2 component (e.g.
#       a VBA script)
#   3 = HTML (normalised)
#   4 = Mail file
#   5 = Graphics
#   6 = ELF
#   7 = ASCII text file (normalised)
#
# In practice, SaneSecurity signatures are only of type 0, type 3
# (HTML), or type 4 (text-based email). Since HTML is only valid
# in the body of an email, we can use "body" checks for those.
# Type 0 and 4 signatures could be testing headers as well, so for
# those we need to use "rawbody" to check the entire email.
#
# NOTE(review): SpamAssassin documents "rawbody" as matching body text
# only and "full" as matching headers plus body -- confirm whether
# "rawbody" really gives the header coverage described above.
#
# Returns "body" for type 3 and "rawbody" for everything else,
# including a missing or non-numeric type.
sub assign_rule_type($) {
    my ($sig_type) = @_;

    # Only compare numerically once we know the field is a number, so a
    # missing or malformed type falls through to the default quietly.
    if (defined($sig_type) && $sig_type =~ /^\s*\d+\s*$/ && $sig_type == 3) {
        return "body";
    }

    return "rawbody";
}

# Construct a meaningful text description of the SpamAssassin rule
#
# Source: http://www.sanesecurity.co.uk/clamav/docs.htm
#
# Per SpamAssassin convention, descriptions should be no longer than
# 50 characters.
#
# Try to find a specific description first, if possible. Try for a
# more general description if that fails. If all else fails, use
# the SaneSecurity signature's name as the description.
#
# Within each table the FIRST matching pattern wins and the search
# stops there.  The result is always prefixed with "SaneSecurity: ",
# plus "HTML-based " or "Text-based " when the signature name starts
# with "Html." or "Email." respectively.
sub assign_rule_description($) {
    my ($sig_name) = @_;

    my $rule_desc = "SaneSecurity: ";
    if ($sig_name =~ /^html\./i) {
        $rule_desc .= "HTML-based ";
    }
    elsif ($sig_name =~ /^email\./i) {
        $rule_desc .= "Text-based ";
    }

    # Search the specific table first, then the general one; stop at the
    # first hit instead of scanning the remaining patterns.
    my $rule_desc2 = "";
    TABLE:
    foreach my $table (\@specific_descriptions, \@general_descriptions) {
        foreach my $description (@$table) {
            my $pat = $description->{"pat"};
            if ($sig_name =~ /$pat/i) {
                $rule_desc2 = $description->{"desc"};
                last TABLE;
            }
        }
    }

    # Last resort: use the signature name itself as the description
    if ($rule_desc2 eq "") {
        $rule_desc2 = $sig_name;
    }

    return ($rule_desc . $rule_desc2);
}

# Escape characters with special meanings in regular expression contexts
#
#   . % ( ) [ ] $ + ; / \ & ?
#   # ^ * { } | @
#
# Takes a single character and returns the text to emit for it inside
# the generated pattern: the listed characters get a leading backslash,
# characters outside the printable ASCII range 32..126 become a \xNN
# escape, and everything else is returned unchanged.
sub escape_regex_metachars($) {
    my ($char) = @_;

    # Quote regex metacharacters
    if ($char =~ /^[\.\%\(\)\[\]\$\+\;\/\\\&\?\#\^\*\{\}\|\@]$/) {
        return '\\' . $char;
    }

    # Convert unprintable characters to hex
    my $code = ord($char);
    if (($code < 32) || ($code > 126)) {
        return sprintf("\\x%02x", $code);
    }

    return $char;
}

# Construct a regular expression out of a SaneSecurity hex signature
#
# The following ClamAV wildcards are supported:
#
#   *             : Match any number of bytes, i.e. ".*"
#
#   {n}           : Match exactly n bytes, i.e. ".{n}"
#
#   {-n}          : Match n or fewer bytes, i.e. ".{0,n}"
#
#   {n-}          : Match n or more bytes, i.e. ".{n,}"
#
#   {n-m}         : Match between n and m bytes, i.e. ".{n,m}"
#
#   (aa|bb|cc|..) : Match aa or bb or cc or...
#
# The "??", "a?" and "?a" wildcards are NOT supported in this version,
# as they are not currently used in SaneSecurity signatures.
#
# The pattern returned is always case-insensitive, since the
# SaneSecurity signatures are normalised to lower case.
#
# Anything that cannot be parsed -- an unknown character, an incomplete
# or invalid hex digit pair, a '{' without a matching '}', or unsupported
# text between braces -- is reported through fatal() as a syntax error.
# An undefined or empty signature yields the empty pattern "//i".
sub assign_rule_pattern($) {
    my ($hexsig) = @_;
    $hexsig = "" unless (defined($hexsig));

    my $rule_pat = "";
    my $sig_len  = length($hexsig);
    my $i        = 0;

    while ($i < $sig_len) {
        my $char = substr($hexsig, $i, 1);

        # Hex digit pair
        if ($char =~ /[0-9A-F]/i) {
            my $pair = substr($hexsig, $i, 2);
            # A lone digit, or a digit followed by a wildcard, is not a
            # byte; refuse it rather than let hex() guess.
            if ($pair !~ /\A[0-9A-F]{2}\z/i) {
                fatal("Syntax error at character $i of signature: $hexsig");
            }
            $rule_pat .= escape_regex_metachars(chr(hex($pair)));
            $i++;    # the pair occupies two characters
        }
        # * => .*
        elsif ($char eq '*') {
            $rule_pat .= ".*";
        }
        # {n}, {-n}, {n-}, {n-m}
        elsif ($char eq '{') {
            # Locate the closing brace up front so that an unterminated
            # '{' is an error instead of an endless scan.
            my $close = index($hexsig, '}', $i + 1);
            if ($close < 0) {
                fatal("Syntax error at character $i of signature: $hexsig");
            }
            my $buf = substr($hexsig, $i + 1, $close - $i - 1);
            $i = $close;

            # {-n} => .{0,n}
            if ($buf =~ /^\-(\d+)$/) {
                $rule_pat .= ".{0," . $1 . "}";
            }
            # {n-} => .{n,}
            elsif ($buf =~ /^(\d+)\-$/) {
                $rule_pat .= ".{" . $1 . ",}";
            }
            # {n-m} => .{n,m}
            elsif ($buf =~ /^(\d+)\-(\d+)$/) {
                $rule_pat .= ".{" . $1 . "," . $2 . "}";
            }
            # {n} => .{n}
            elsif ($buf =~ /^(\d+)$/) {
                $rule_pat .= ".{" . $1 . "}";
            }
            # Syntax error!
            else {
                fatal("Syntax error at character $i of signature: $hexsig");
            }
        }
        # (aa|bb|cc|..)
        elsif ($char =~ /[\(\)\|]/) {
            $rule_pat .= $char;
        }
        # Syntax error!
        else {
            fatal("Syntax error at character $i of signature: $hexsig");
        }

        $i++;
    }

    return ('/' . $rule_pat . '/i');
}

# Convert a list of SaneSecurity signatures to SpamAssassin rules
#
# Takes a reference to an array of Signature objects and fills in the
# rule_name, rule_type, rule_desc and rule_pat members of each one from
# its sig_name, type and hexsig members.  The objects are updated in
# place.
sub convert_signatures_to_rules($) {
    my ($Signatures_ref) = @_;

    foreach my $sig (@$Signatures_ref) {
        my $name = $sig->sig_name;
        $sig->rule_name(assign_rule_name($name));
        $sig->rule_type(assign_rule_type($sig->type));
        $sig->rule_desc(assign_rule_description($name));
        $sig->rule_pat(assign_rule_pattern($sig->hexsig));
    }
}