#!/bin/sh
# Usage: get-topica-messages.sh [topicalistname [startmid [endmid [wait]]]]
#   topicalistname : the name of the list to download; defaults to PerthIMC,
#                    overridden by $LIST
#   startmid       : the message id of the message to start with; defaults to
#                    -a, overridden by $MID.
#                    You can get this by opening
#                    http://topica.com/lists/somelist/read and searching for
#                    mid=somenumber in the link of the message you want to
#                    start at.
#                    -a : download the message index to find out the first
#                         message id instead
#   endmid         : the message id of the last message to download
#                    (inclusive); defaults to downloading all messages,
#                    overridden by $EMID
#   wait           : the --wait parameter to wget (average time in seconds to
#                    wait between downloads); defaults to 2, overridden by
#                    $WAIT
#
# The index (startmid=-a) is dumped to index.html.
# All the messages are dumped to message-MID.html files.
# All the wget output is logged to topica_wget_log.txt for your perusal
# (overwritten on startup).
#
# In the future this will parse all the messages and dump .mbox files, which
# can be converted to simple HTML with pipermail/mailman.
#
# It would be great to just be able to set wget to follow links with
# "Next Message" as the text, e.g. wget --follow-text="Next Message" or
# --follow-decision=~/bin/decide.sh
#
# Requires:
#   sh, printf, wget, sed
# These can be substituted with:
#   bash/almost any shell
#   curl/...
#   awk/perl/python

# Options are set here rather than on the shebang line so they still apply
# when the script is run as "sh get-topica-messages.sh".
set -eu

# Environment variables take precedence over positional arguments, which take
# precedence over the built-in defaults.
LIST=${LIST:-${1:-PerthIMC}}
MID=${MID:-${2:--a}}
EMID=${EMID:-${3:-}}
# Be nice to the server: wait 2 seconds on average unless told otherwise.
WAIT=${WAIT:-${4:-2}}

readonly BASE_URL="http://topica.com/lists/$LIST"
readonly LOG_FILE=./topica_wget_log.txt
# Sentinel inserted in front of an extracted id so that everything preceding
# it on the line can be stripped by a second sed pass.
readonly MARKER=xxxxxxxxxxxxxx

#######################################
# Print the first message id linked from an HTML page (nothing if none).
# Globals:   LIST (read), MARKER (read)
# Arguments: $1 - HTML file to scan
#            $2 - optional BRE that must follow the id on the same line
# Outputs:   the id (digits only) on stdout
# NOTE: $LIST is interpolated into the sed expression, so a list name
#       containing '|' or regex metacharacters would break the match.
#######################################
first_mid() {
  # Pass 1 replaces the leftmost matching link and the rest of its line with
  # MARKER<id>; pass 2 keeps only the first such line and drops everything up
  # to and including the marker. "[?]" matches a literal question mark
  # portably ("\?" is a GNU-only quantifier in basic regular expressions).
  sed -n -e "s|/lists/${LIST}/read/message\.html[?][^m]*mid=\([0-9]*\)${2-}.*|${MARKER}\1|p" "$1" |
    sed -n -e "1s|.*${MARKER}||p"
}

# Start with an empty log.
: > "$LOG_FILE"

# Get the MID of the first message if there is no start message.
if [ "$MID" = "-a" ]; then
  printf '%s' 'Getting the message index'
  # Branch on wget's exit status: "wget -O" creates the output file even when
  # the download fails, so testing for the file's existence proves nothing.
  if wget -O index.html "$BASE_URL/read?start=0" 2>> "$LOG_FILE"; then
    echo done
    MID=$(first_mid index.html)
  else
    echo failed
    exit 1
  fi
fi

while [ -n "$MID" ]; do
  page="message-$MID.html"

  # Download the message.
  printf '%s' "Getting $page ..."
  # NOTE(review): the wget manual describes --wait as the pause between
  # retrievals within one invocation; each call here fetches a single URL, so
  # this may not actually throttle requests - confirm, and consider an
  # explicit sleep between iterations.
  if wget --random-wait -w "$WAIT" -O "$page" \
    "$BASE_URL/read/message.html?mid=$MID" 2>> "$LOG_FILE"; then
    echo done
  else
    echo failed
    exit 1
  fi

  # Stop once the requested last message has been fetched.
  if [ -n "$EMID" ] && [ "$MID" = "$EMID" ]; then
    break
  fi

  # Get the id of the next message; empty when there is no "Next Message"
  # link, which ends the loop.
  MID=$(first_mid "$page" '[^"]*">Next Message')
done

# FIXME:
# Now we have the messages, so it's time to convert them to .mbox files.
# Ideas:
#   - use message ids (put in Message-ID:) and the subject line (use Re: and
#     the message index to determine parent status) to make the messages
#     threaded
#   - strip out the bits of the message we want - need python
#       subject
#       author
#       timestamp
#       body
#   - fix up broken line wrapping
#   - handle the stupid mechanism for quoting (a table with a magenta cell on
#     the left) and multiple nesting depths
#   - slap those back together in mbox/mime format
#   - pass the mbox to mailman/pipermail to create HTML archives