#!/usr/bin/perl -w # See catmailuniq --help # Version 1.1 # Changelog: # # 1.1 (29-Jan-2007): # * Skip uw-imapd placeholders unless -i option given use POSIX qw(strftime); # Process arguments show_help(1) if (!@ARGV); show_help(0) if ($ARGV[0] eq "--help" ||$ARGV[0] eq "-h" || $ARGV[0] eq "-?"); my $single = 0; my $batch = 0; my $process_imap = 0; if ($ARGV[0] eq "-s") { $single = 1; shift; } elsif ($ARGV[0] eq "-b") { $batch = 1; shift; } if ($ARGV[0] eq "-i") { $process_imap = 1; shift; } show_help(1) if (!@ARGV); my $workdir = shift; # Get on with it if ($single) { my @message = ; if (!$process_imap) { foreach (@message) { exit if (/^Subject: DON'T DELETE THIS MESSAGE -- FOLDER INTERNAL DATA$/); last if (/^$/); } } my $templeaf = unique_file("$workdir/messages"); my $tempfile = "$workdir/messages/$templeaf"; open(TEMP, "> $tempfile") || die "Unable to open $tempfile for writing\n"; print TEMP @message; close(TEMP); # Add leafname to "list" file open(LISTFILE, ">> $workdir/list") || die "Unable to open $workdir/list for writing"; print LISTFILE "$templeaf\n"; close(LISTFILE); } elsif ($batch) { batch_process(); } else { show_help(1) if (!@ARGV); my $mbox = shift; # Create workdir etc if necessary if (!-d $workdir) { mkdir($workdir) || die "Unable to create work directory $workdir"; } if (!-d "$workdir/messages") { mkdir("$workdir/messages") || die "Unable to create \"messages\" subdirectory\n"; } if (!-d "$workdir/nomsgids") { mkdir("$workdir/nomsgids") || die "Unable to create \"nomsgids\" subdirectory\n"; } open(MBOX, "$mbox") || die "Unable to read $mbox\n"; print "Extracting messages from $mbox\n"; # For some reason perl can't find catmailuniq without explicit PATH my $shpath = $ENV{"PATH"}; my $pi = ""; if ($process_imap) { $pi = "-i"; } open(FORMAIL, "|PATH=$shpath formail -s perl -wS catmailuniq -s $pi $workdir") || die "Unable to pipe to formail\n"; while () { print FORMAIL $_; } close(FORMAIL); close(MBOX); batch_process(); } sub show_help { if ($_[0] == 1) { print STDERR "catmailuniq usage error. Help follows\n"; } print "catmailuniq version 1.1.\n"; print "catmailuniq takes multiple mbox files and concatenates them, \n"; print "removing duplicates. A work directory is used to store the \n"; print "output - \"output\" - and state files so that it can be used \n"; print "repeatedly to build up one file from many.\n"; print "catmailuniq relies on formail, md5sum, diff and being able to \n"; print "call itself recursively ie all must be present in \$PATH\n"; print "Usage: catmailuniq --help\n"; print " catmailuniq -h\n"; print " catmailuniq -?\n"; print " Show this message\n"; print " catmailuniq [-i] WORKDIR FILE\n"; print " Add FILE to the output and work data in WORKDIR\n"; print " Use -i to include uw-imapd placeholders, otherwise"; print " they're skipped.\n"; print " catmailuniq -s [-i] WORKDIR (used internally)\n"; print " Take a single message on STDIN and add it to WORKDIR\n"; print " catmailuniq -b WORKDIR (used internally)\n"; print " Batch up all the individual items added by -s\n"; exit($_[0]); } # Create a unique filename in the given directory. It probably won't be unique # if you call again before creating the first file! Note that only the leafname # is returned sub unique_file { my $counter = 0; my$filename; do { $filename = sprintf("%s%s%d", strftime("%s", gmtime), $$, $counter++); } while (-f "$_[0]/$filename"); return $filename; } # Adds value $_[2] to key $_[1] in hash referenced by $_[0] sub add_md5sum { my $md5hash = $_[0]; my $key = $_[1]; my $value = $_[2]; # If the list is empty we need to create it, otherwise push new value if (undef($md5hash->{$key}) || !$md5hash->{$key}) { $md5hash->{$key} = [($value)]; } else { push(@{$md5hash->{$key}}, $value); } } # Appends file named (absolute) by $_[1] to file (handle) referenced by $_[0] sub output_message { my $outfile = $_[0]; my $inname = $_[1]; open(INFILE, "$inname") || die "Unable to read message $inname to add to output"; while () { print $outfile $_; } } sub batch_process { # Open list of new message filenames if (!open(LISTFILE, "$workdir/list")) { if (!-f "$workdir/list") { print "No $workdir/list (nothing to do)\n"; exit(0); } else { die "Unable to read $workdir/list"; } } # Load any existing list of MessageIDs and/or md5sums #print "Loading list of previously batched Message-IDs\n"; my %messageids = (); if (open(MESSAGEIDS, "$workdir/messageids")) { while () { chomp; $messageids{$_} = 1; } close(MESSAGEIDS); } # md5sums are much harder to load, because they're not necessarily unique, # so each datum value is actually a reference to an array #print "Loading list of previously batched messages without Message-IDs\n"; my %md5sums = (); if (open(MD5SUMS, "$workdir/md5sums")) { while () { chomp; my $key = $_; my $value = $_; $key =~ s/\s+.*//; $value =~ s/^\S+\s+//; add_md5sum(\%md5sums, $key, $value); } close(MD5SUMS); } # Keep track of how many messages have been added, skipped etc my $added = 0; my $duplicates = 0; my $nomsgid = 0; print "Adding batch of extracted messages to $workdir/output\n"; # Get ready to write to output file open(my $outfile, ">> $workdir/output") || die "Unable to open $workdir/output for writing"; # Now process each item in list file while () { chomp; # $_ gets lost somewhere before the end of the loop so save it my $leafname = $_; if (!-f "$workdir/messages/$leafname") { print STDERR "$workdir/messages/$leafname unreadable, " . "obsolete list file?\n"; next; } # Does message have a Message-ID? my $messageid = `formail -x Message-ID < $workdir/messages/$leafname`; # Following makes chomp superfluous $messageid =~ s/\s//g; if ($messageid) { if (defined($messageids{$messageid}) && $messageids{$messageid}) { ++$duplicates; } else { $messageids{$messageid} = 1; output_message($outfile, "$workdir/messages/$leafname"); ++$added; } unlink("$workdir/messages/$leafname"); } else { my $md5sum = `md5sum $workdir/messages/$leafname`; $md5sum =~ s/\s+.*//; $md5sum =~ s/\s//g; my $dup = 0; if (defined($md5sums{$md5sum}) && $md5sums{$md5sum}) { # Even if md5sum isn't unique, file just may be, so diff against # each file with same sum foreach (@{$md5sums{$md5sum}}) { s/\s//g; my $diff = `diff -q $workdir/nomsgids/$_ $workdir/messages/$leafname`; $diff =~ s/\s//g; if (!$diff) { $dup = 1; last; } } } if ($dup) { ++$duplicates; unlink("$workdir/messages/$leafname"); } else { ++$nomsgid; my $new_leafname = unique_file("$workdir/nomsgids"); rename("$workdir/messages/$leafname", "$workdir/nomsgids/$new_leafname"); output_message($outfile, "$workdir/nomsgids/$new_leafname"); add_md5sum(\%md5sums, $md5sum, $new_leafname); } } } close(LISTFILE); close($outfile); unlink("$workdir/list"); if ($added) { #print "Saving updated list of Message-IDs\n"; open(MESSAGEIDS, "> $workdir/messageids") || die "Unable to write updated $workdir/messageids"; foreach (keys(%messageids)) { # Keys seem to accumulate extra whitespace so remove it safely s/\s//g; print MESSAGEIDS "$_\n"; } close(MESSAGEIDS); } if ($nomsgid) { #print "Saving updated list of messages lacking Message-IDs\n"; open(MD5SUMS, "> $workdir/md5sums") || die "Unable to write updated $workdir/md5sums"; my $md5sum; my $leafnames; while (($md5sum, $leafnames) = each(%md5sums)) { # Keys seem to accumulate extra whitespace so remove it safely $md5sum =~ s/\s//g; foreach (@{$leafnames}) { s/\s//g; print MD5SUMS "$md5sum\t$_\n"; } } close(MD5SUMS); } print "Added $added unique messages, skipped $duplicates duplicates, " . "$nomsgid lacked Message-IDs\n"; } # vim:ts=8: