#!/usr/bin/perl
#
# Concept:
#
#  Convert a hypermail archive to an mbox, weakly.
#
# Author:
#
#  Scott Rose, rose@cs.[wisc,washington].edu.
#
#     With modifications by Kent Landfield. 
#     If it's broke, it's my fault.
#
# Usage:
#
#   hypetombox.pl [-d <directory>]
#
# $Header: /cvs/hypermail/hypermail/contrib/hypetombox.pl,v 1.2 1999/03/07 21:13:04 cvsdev Exp $

require 5.000;
use Getopt::Std;

# Parse the command line.  The only option is -d <directory>, which
# Getopt::Std stores in $opt_d.  getopts() returns false when it rejects
# the command line, so stop with a usage line rather than carrying on
# with options that were not understood.

getopts('d:') || die "usage: $0 [-d <directory>]\n";

# This is a list of the fields in the comment header of each message.

@fields = ('received', 'sent', 'name', 'email', 'subject', 'id', 'inreplyto');

# Get a list of the message files: every file named with four digits
# followed by ".html", looked up in the -d directory when one was given
# and in the current directory otherwise.  Sorting the names puts the
# zero-padded message numbers in ascending order.

$fpat = '[0-9][0-9][0-9][0-9].html';
$fpat = "$opt_d/$fpat" if (defined $opt_d);

@msgs = sort glob($fpat);

# Open the output file for write.  It is always called "mbox", is created
# in the current directory, and any existing file of that name is
# truncated.  Include $! so a failure says why.

(open MBOX, '>mbox') || die "can't open mbox: $!";

# Counters reported once all files have been processed:
#   $cntr      - number of message files handled
#   $boguscntr - number of those whose received date could not be parsed

$cntr = 0;
$boguscntr = 0;

# Loop on the input files, appending one mbox entry per file to MBOX.
#
# Each file is read line by line through a small state machine:
#
#   HeaderComments - leading  <!-- key="value" -->  lines; values are
#                    collected in %hdr.
#   LeadingGoo     - everything between the header comments and the
#                    <!-- body="start" --> marker; skipped.
#   Body           - message text; cleaned up and written out.
#   TrailingGoo    - everything after <!-- body="end" -->; skipped.
#
# The mbox header is written at the first line that is not a header
# comment.  Reads @msgs and @fields, and updates $cntr and $boguscntr.

foreach $msg (@msgs) {

    $cntr += 1;

    # Open the message file for read.

    (open M, $msg) || die "can't open $msg: $!";

    # Start every message with an empty value for each known field, so a
    # field this file does not carry (inreplyto is only printed when set)
    # is not inherited from the previous message.

    my %hdr;
    @hdr{@fields} = ('') x @fields;

    my $state = 'HeaderComments';

    # Loop on lines in the file.

    while (<M>) {

        # This is a header comment; remember its value under its key.
        # A hash is used rather than a variable named after the key, so
        # that a comment in the input cannot overwrite the script's own
        # variables.

        if ($state eq 'HeaderComments' && /^<!-- (\w+)="([^"]+)" -->$/) {
            my($key, $value) = ($1, $2);
            $hdr{$key} = decode_header_value($value);
            next;
        }

        # First line past the header comments: emit the mbox header.

        if ($state eq 'HeaderComments') {
            $state = 'LeadingGoo';

            my $date = envelope_date($hdr{'received'});
            if (!defined $date) {
                $date = 'Bogus date';
                $boguscntr += 1;
            }
            print MBOX mbox_header($date, \%hdr);
        }

        $state = 'Body'        if (/^<!-- body="start" -->/);
        $state = 'TrailingGoo' if (/^<!-- body="end" -->/);

        # Drop every comment line (including the two markers above) and
        # everything outside the body.

        next if (/^<!--/ || $state ne 'Body');

        # This is a body line.

        print MBOX clean_body_line($_);

    } # end loop on message file lines

    # Blank line separating this entry from the next one.

    print MBOX "\n";

    (close M) || die "can't close $msg: $!";

} # end loop on messages

# Reverse the &lt; / &gt; / &amp; escaping in a header comment value and
# return the result.  &amp; is handled last so that an escaped entity such
# as "&amp;lt;" comes out as the text "&lt;" instead of being decoded twice.

sub decode_header_value {
    my($value) = @_;

    $value =~ s/&lt;/</g;
    $value =~ s/&gt;/>/g;
    $value =~ s/&amp;/&/g;
    return $value;
}

# Return the leading "Www Mmm dd hh:mm:ss yyyy" portion of a received
# value for use on the mbox "From " line, or undef when the value does
# not start with a date of that shape.

sub envelope_date {
    my($received) = @_;

    if ($received =~ /^(\w{3} \w{3} {1,2}\d{1,2} \d{2}:\d{2}:\d{2} \d{4})/) {
        return $1;
    }
    return;
}

# Build the text that opens one mbox entry: the "From " separator line,
# the mail headers and the blank line that ends them.
#
#   $date - date text for the separator line
#   $h    - reference to a hash with the keys email, sent, id, name,
#           subject and (optionally non-empty) inreplyto
#
# The recipient is not available here, so "To:" is always "bogus".
# In-Reply-To is only included when inreplyto holds a true value.

sub mbox_header {
    my($date, $h) = @_;

    my $text = "From " . $h->{'email'} . "  $date\n"
             . "Date: " . $h->{'sent'} . "\n"
             . "Message-Id: <" . $h->{'id'} . ">\n"
             . "To: bogus\n"
             . "From: " . $h->{'email'} . " (" . $h->{'name'} . ")\n"
             . "Subject: " . $h->{'subject'} . "\n";

    if ($h->{'inreplyto'}) {
        $text .= "In-Reply-To: <" . $h->{'inreplyto'} . ">\n";
    }
    return $text . "\n";
}

# Turn one line of the HTML message body back into plain text and return
# it.  The steps run in a fixed order: strip markup at the end of the
# line, unwrap hyperlinks, decode entities, unwrap whole-line <i>/<EM>,
# then quote a leading "From ".

sub clean_body_line {
    my($line) = @_;

    $line =~ s/<br>$//;                 # lose the trailing <br>
    $line =~ s/<BR>$//;
    $line =~ s/<pre>$//;                # lose the <pre> formatting tags
    $line =~ s/<PRE>$//;
    $line =~ s/<\/pre>$//;
    $line =~ s/<\/PRE>$//;
    $line =~ s/<P>$//;                  # lose the paragraph tags
    $line =~ s/<p>$//;

    # Lose hyperlinks, keeping only the link text.
    $line =~ s{<a href=[^>]+>([^<]+)</a>}{$1}g;
    $line =~ s{<A HREF=[^>]+>([^<]+)</A>}{$1}g;

    # Reverse map special characters; &amp; last so nothing is decoded
    # twice.
    $line =~ s/&lt;/</g;
    $line =~ s/&gt;/>/g;
    $line =~ s/&amp;/&/g;

    # Unwrap a line that is entirely italic or emphasised.
    $line =~ s{^<i>(.+)</i>$}{$1};
    $line =~ s{^<EM>(.+)</EM>$}{$1};

    # Don't let bogus message start lines in!  Only a ">" is added in
    # front; the rest of the line, including the space, is kept.
    $line =~ s/^From />From /;

    return $line;
}

# Finish the output file.  Buffered write errors only show up at close
# time, so report the reason if it fails.

(close MBOX) || die "can't close mbox: $!";

# Report how many message files were processed and how many of them were
# written with a placeholder date on their "From " line.

$good = $cntr-$boguscntr;
print "$cntr messages processed ($good good messages : $boguscntr bogus date messages)\n";

# The placeholder written into mbox is spelled "Bogus date"; quote it here
# exactly as written so a case-sensitive search finds it.

if ($boguscntr) {
    print "Bogus date encountered, hand edit and search for \"Bogus date\"\n";
}
