#!/bin/sh -e

# Usage: get-topica-messages.sh [topicalistname [startmid [endmid [wait]]]]
#  topicalistname : the name of the list to download, defaults to PerthIMC, overridden by $LIST
#  startmid : the message id of the message to start with, defaults to -a, overridden by $MID
#    you can get this by opening http://topica.com/lists/somelist/read & searching for mid=somenumber in the link of the message you want to start at; 
#    -a : Download the messages index to find out the first message id instead
#  endmid : the message id of the message to stop at, defaults to downloading all messages, overridden by $EMID
#  wait : The --wait parameter to wget (average time (sec) to wait between downloads), defaults to 2, overridden by $WAIT

#  the index (startmid=-a) is dumped to index.html
#  all the messages are dumped to message-MID.html files
#  all the wget output is logged to topica_wget_log.txt for your perusal (overwritten on startup)
#  in the future this will parse all the messages & dump .mbox files, which can be converted to simple html with pipermail/mailman
#  it would be great to just be able to set wget to follow links with "Next Message" as the text eg wget --follow-text="Next Message" or --follow-decision=~/bin/decide.sh 

# Requires
#  sh, echo -n, wget, sed, sort -u
#  These can be substituted with:
#   bash/almost any shell
#   cat /dev/null
#   curl/...
#   awk/perl/python
#   uniq/awk

# Resolve the settings. For each one a non-empty environment variable wins,
# then the matching positional argument, then the built-in default.
LIST="${LIST:-${1:-PerthIMC}}"

# "-a" asks for the start id to be looked up in the message index.
MID="${MID:-${2:--a}}"

# There is no default end id: with neither $EMID nor $3 given it stays empty.
EMID="${EMID:-$3}"

# Be nice to the server: pause 2 seconds between downloads unless told
# otherwise, and use a random wait just in case. WAIT ends up holding the
# complete wget option string rather than the bare number.
WAIT="--random-wait -w ${WAIT:-${4:-2}}"

# Start every run with an empty wget log (":" is the portable no-op; "echo -n"
# is not guaranteed to behave the same in every sh).
: > topica_wget_log.txt

# topica_first_mid LISTNAME INDEXFILE
#   Print the message id of the first message link found in INDEXFILE, or
#   nothing when the file contains no such link.
topica_first_mid() {
	# Stage 1 rewrites everything from the link path to the end of the line as
	# marker+id, leaving the text in front of the link untouched; stage 2 drops
	# everything up to the marker and prints what is left (the id).
	# "[?]" matches a literal question mark in any sed ("\?" is a GNU-only
	# quantifier), and at least one digit is required so an id is never empty.
	# Every matching line yields an id, so keep only the first one.
	sed -n \
		-e 's|/lists/'"$1"'/read/message\.html[?][^m]*mid=\([0-9][0-9]*\).*|xxxxxxxxxxxxxx\1|' \
		-e 's|.*xxxxxxxxxxxxxx||p' "$2" | head -n 1
}

# get the MID of the first message if there is no start message
if [ "$MID" = "-a" ]; then
	# get the start of the index
	printf '%s' 'Getting the message index'
	# Test wget's own exit status: -O creates index.html even when the download
	# fails, so checking that the file exists proves nothing. Running wget as
	# the "if" condition also keeps "sh -e" from aborting before the failure
	# can be reported. The URL is quoted because it contains "?".
	if wget "http://topica.com/lists/$LIST/read?start=0" -O index.html 2>> ./topica_wget_log.txt; then
		echo done
		MID=$(topica_first_mid "$LIST" index.html)
		if [ -z "$MID" ]; then
			echo "No message id found in index.html" >&2
			exit 1
		fi
	else
		echo failed
		# Report the failure to the caller instead of exiting with status 0.
		exit 1
	fi
fi

# topica_next_mid LISTNAME MESSAGEFILE
#   Print the message id that the "Next Message" link in MESSAGEFILE points
#   to, or nothing when the page has no such link.
topica_next_mid() {
	# Stage 1 rewrites everything from the link path to the end of the line as
	# marker+id, leaving the text in front of the link untouched; stage 2 drops
	# everything up to the marker and prints what is left (the id).
	# "[?]" matches a literal question mark in any sed ("\?" is a GNU-only
	# quantifier). If several lines match, only the first id is kept so that
	# MID is always a single value.
	sed -n \
		-e 's|/lists/'"$1"'/read/message\.html[?][^m]*mid=\([0-9][0-9]*\)[^"]*">Next Message.*|xxxxxxxxxxxxxx\1|' \
		-e 's|.*xxxxxxxxxxxxxx||p' "$2" | head -n 1
}

# Walk the list by following each page's "Next Message" link, starting at
# $MID. The walk ends when a page has no next link, when a download fails, or
# once the message whose id equals $EMID has been saved.
while [ -n "$MID" ]; do
	# Download the message
	printf '%s' "Getting message-$MID.html ..."
	# $WAIT is deliberately unquoted: it holds several wget options.
	# Test wget's own exit status: -O creates the output file even when the
	# download fails, and running wget as the "if" condition keeps "sh -e" from
	# aborting before the failure can be reported.
	if wget $WAIT "http://topica.com/lists/$LIST/read/message.html?mid=$MID" -O "message-$MID.html" 2>> ./topica_wget_log.txt; then
		echo done
	else
		echo failed
		break
	fi
	# Honour the end message id: stop after the requested last message.
	if [ "$MID" = "$EMID" ]; then
		break
	fi
	# Get the id of the next message
	MID=$(topica_next_mid "$LIST" "message-$MID.html")
done

# FIXME:
# Now we have the messages, so it's time to convert them to .mbox files
# Ideas
	# use message ids (put in Message-ID:) & subject line (use Re: and the messages index to determine parent status) to make the messages threaded
	# strip out the bits of the message we want - need python
		# subject
		# author
		# timestamp
		# body
			# fix up broken line wrapping
			# handle the stupid mechanism for quoting (a table with a magenta cell on the left) & multiple nesting depths
	# slap those back together in mbox/mime format
	# pass the mbox to mailman/pipermail to create HTML archives