tools:db:mailboximporter
no way to compare when less than two revisions
Differences
This shows you the differences between two versions of the page.
Last revision | |||
— | tools:db:mailboximporter [2019/01/12 17:53] – external edit 127.0.0.1 | ||
---|---|---|---|
Line 1: | Line 1: | ||
+ | My workflow is a little convoluted, because I was going for more of a quick-n-dirty solution than a fully-featured one, but here goes. | ||
+ | I wanted to convert a bunch of mbox (email) data to FlatPress entries. I'm using Mozilla Thunderbird, | ||
+ | |||
+ | I needed to parse out the date, subject, author, and text from the emails, and then feed that into FP somehow. Here's what I did: | ||
+ | |||
+ | 1) Inside my mail reader, I put all the relevant emails into the same folder. (call it " | ||
+ | 2) from the filesystem, I made a working copy of that folder. (a file called " | ||
+ | 3) I used uudeview (actually, xdeview, the X11 version of it) to separate the attachments from the text.\\ | ||
+ | 3a) In xdeview, I turned on " | ||
+ | 4) I concatenated all the 0*.txt files into one big file (FP_posts.txt) that now just contains the text parts of the emails.\\ | ||
+ | 5) I ran this file through the readmbox.pl Perl script that I wrote (code below)\\ | ||
+ | 5a) This creates FP_Posts.txt.csv (despite the .csv extension, the fields are separated by characters other than commas -- see the delimiter variables in the code) containing just the fields that FP is interested in.\\
+ | 6) I uploaded the CSV to a new folder on the webserver, .../ | ||
+ | 7) I uploaded mboxtoflatpress.php, | ||
+ | 8) Go to URL .../ | ||
+ | 8a) This will print out subject lines of each message as they are created on the blog\\ | ||
+ | 9) Go to Admin-> | ||
+ | |||
+ | The script posts each entry:\\ | ||
+ | - as a draft, in case there are some you don't want going public immediately.\\
+ | - with the date/time that appears in the email\\ | ||
+ | - with " | ||
+ | - with " | ||
+ | |||
+ | Here's readmbox.pl: | ||
+ | < | ||
+ | # | ||
+ | # readmbox.pl - parse an mbox-style file into a CSV-style file | ||
+ | # | ||
+ | # Created 03.05.2009 by Jimbo S. Harris | ||
+ | # | ||
+ | # This file reads a source file off the command line | ||
+ | # and parses out the following fields from each email in the file: | ||
+ | # | ||
+ | # From: To: Date: Subject: and the message body | ||
+ | # | ||
+ | # it makes some attempt to handle multipart messages, but it's best if they' | ||
+ | # | ||
+ | # The output is in the form: | ||
+ | # | ||
+ | # FROM|TO|SUBJECT|DATE|`BODY`|ø | ||
+ | # | ||
+ | # with one such entry in the output file for each recognized message in the input file. | ||
+ | # | ||
+ | # The output file has the same name as the input file, with " | ||
+ | # | ||
+ | # The output file is designed to be read by mboxtoflatpress.php | ||
+ | # | ||
+ | |||
+ | # Open the input file named by the first command-line argument (die if that fails), then the output file. NOTE(review): the die text has no separator before the file name and does not include $! | ||
+ | open( IN, $ARGV[0] ) || die "could not open the input file" . $ARGV[0]; | ||
+ | open( OUT, ">" | ||
+ | |||
+ | # Parser state, all package globals: $state selects header vs. body handling, $body accumulates message text, $boundary/$hasboundary track a multipart boundary, $save gates body capture, $count counts lines for progress output; the three *_delim values are the field, body-quote and record separators written to OUT (their literal values are cut off in this copy); $|=1 turns on autoflush for the selected output handle | ||
+ | $state = " | ||
+ | $body = ""; | ||
+ | $boundary = ""; | ||
+ | $hasboundary = 0; | ||
+ | $delim=" | ||
+ | $text_delim=" | ||
+ | $eol_delim=" | ||
+ | $printdots=1; | ||
+ | $count=0; | ||
+ | $save=1; | ||
+ | $|=1; | ||
+ | |||
+ | print $state . " | ||
+ | |||
+ | # Read the input one line at a time; each line is copied into $line and its last character removed. NOTE(review): chop removes the final character unconditionally (chomp would only remove a trailing newline), so a last line without a newline loses a real character | ||
+ | while( <IN> ) | ||
+ | { | ||
+ | $line = ""; | ||
+ | $line = $_; | ||
+ | chop $line; | ||
+ | |||
+ | # Progress indicator: print the running line count to STDOUT every 1000 lines ($count is reset to 0 at each state change below) | ||
+ | $count++; | ||
+ | print " " . $count . " " if( $count % 1000 == 0 ); | ||
+ | |||
+ | # Header state: capture the text after the first colon of the From:, To:, Subject: and Date: lines (any leading space after the colon is kept) | ||
+ | if( $state eq " | ||
+ | { | ||
+ | print " | ||
+ | ($junk, $from) = split( /:/, $line, 2 ) if( $line =~ /^From:/ ); | ||
+ | ($junk, $to) = split( /:/, $line, 2 ) if( $line =~ /^To:/ ); | ||
+ | ($junk, $subject) = split( /:/, $line, 2 ) if( $line =~ /^Subject:/ ); | ||
+ | ($junk, $date) = split( /:/, $line, 2 ) if( $line =~ /^Date:/ ); | ||
+ | |||
+ | # Multipart detection: any line containing "boundary" is split at its first "=" to obtain the boundary string, which is then post-processed (those statements are cut off in this copy). The author notes this worked OK but can be buggy. | ||
+ | if( $line =~ /boundary/ ) | ||
+ | { | ||
+ | #print " | ||
+ | |||
+ | #$boundary = $_; | ||
+ | |||
+ | #$boundary =~ s/ | ||
+ | ($junk, $boundary) = split( /=/, $line, 2 ) ; | ||
+ | |||
+ | $boundary =~ s/ | ||
+ | |||
+ | $boundary = " | ||
+ | |||
+ | #print " | ||
+ | $hasboundary = 1; | ||
+ | #print "got boundary: " . $boundary . " | ||
+ | } | ||
+ | |||
+ | # Leaving the header state: a multipart message waits for a line matching the pattern below (cut off in this copy); a single-part message switches on the first blank line and then uses "From " as its boundary | ||
+ | if( $hasboundary == 1 ) | ||
+ | { | ||
+ | if( $line =~ / | ||
+ | { | ||
+ | $state = " | ||
+ | print $count . " | ||
+ | $count = 0; | ||
+ | } | ||
+ | } | ||
+ | else | ||
+ | { | ||
+ | if( $line =~ /^$/ ) # if you're in the header in a single-part message, look for a blank line | ||
+ | { | ||
+ | $state = " | ||
+ | $boundary = "From "; | ||
+ | print $count . " | ||
+ | $count = 0; | ||
+ | } | ||
+ | } | ||
+ | } | ||
+ | |||
+ | # Body state: lines are appended to $body (while $save is 1) until the pattern below (cut off in this copy) matches | ||
+ | elsif( $state eq " | ||
+ | { | ||
+ | print " | ||
+ | |||
+ | # the boundary represents a state transition | ||
+ | if( $line =~ / | ||
+ | { | ||
+ | if( $hasboundary == 1 ) | ||
+ | { | ||
+ | # if you're in a multipart message, and you're saving off the body, | ||
+ | # once you find the next boundary, just " | ||
+ | # while looking for the "From " (the beginning of the following message) | ||
+ | $boundary = "From "; | ||
+ | $hasboundary = 0; | ||
+ | print $count . " | ||
+ | $count = 0; | ||
+ | $save = 0; | ||
+ | } | ||
+ | else | ||
+ | { | ||
+ | # if you're just looking for the beginning of the next message ("From - ") | ||
+ | # and you've found it, then spit out the current message (only when $from is non-empty) and start a new one. NOTE(review): this is the only write to OUT, so the last message in the input (which has no following "From " line) looks like it is never written; also only $body and $from are cleared here, so $to/$subject/$date can carry over from the previous message -- confirm. | ||
+ | print $count . " | ||
+ | print OUT $from . $delim . $to . $delim . $subject . $delim . $date . $delim . $text_delim . $body . $text_delim . $delim . $eol_delim if( $from ); | ||
+ | $state = " | ||
+ | $body = ""; | ||
+ | $from = ""; | ||
+ | print " | ||
+ | $count = 0; | ||
+ | $boundary = ""; | ||
+ | $hasboundary = 0; | ||
+ | $save = 1; | ||
+ | } | ||
+ | |||
+ | } | ||
+ | # normal condition -- haven' | ||
+ | elsif( $save == 1 ) | ||
+ | { | ||
+ | print " | ||
+ | $body = $body . $line . " | ||
+ | } | ||
+ | } | ||
+ | } | ||
+ | close( IN ); | ||
+ | close(OUT); | ||
+ | </ | ||
+ | |||
+ | Here's mboxtoflatpress.php: | ||
+ | <code php> | ||
+ | <?php | ||
+ | /* mboxtoflatpress.php - post a CSV to FlatPress | ||
+ | |||
+ | Created 3.6.2009 by Jimbo S. Harris | ||
+ | |||
+ | This script reads input files created by readmbox.pl | ||
+ | The files are in the form: | ||
+ | |||
+ | FROM|TO|SUBJECT|DATE|`BODY`|ø | ||
+ | |||
+ | with possibly multiple entries per file | ||
+ | |||
+ | use $debug to see what the parsed entries look like without posting them | ||
+ | */ | ||
+ | |||
+ | |||
+ | // FlatPress bootstrap: pull in the FlatPress include files, run system_init(), and abort via die() when user_loggedin() returns a falsy value | ||
+ | include ' | ||
+ | include INCLUDES_DIR .' | ||
+ | |||
+ | system_init(); | ||
+ | |||
+ | if(!user_loggedin()) die(' | ||
+ | // end FlatPress init | ||
+ | // Map month names to their corresponding month numbers (the entries are cut off in this copy; presumably consulted when building the timestamp further down -- confirm) | ||
+ | |||
+ | $monthnum = array( | ||
+ | " | ||
+ | " | ||
+ | " | ||
+ | " | ||
+ | " | ||
+ | " | ||
+ | " | ||
+ | " | ||
+ | " | ||
+ | " | ||
+ | " | ||
+ | " | ||
+ | ); | ||
+ | |||
+ | // Set $debug to 1 in order to see the CSV get parsed without actually posting anything. | ||
+ | $debug = 0; | ||
+ | |||
+ | // Grab the filename off of the URL and read the whole file into $mbox. NOTE(review): the name comes from $_GET and no validation is visible before file_get_contents() (the assignment is cut off in this copy), and the return value is not checked for failure -- confirm the name cannot point outside the upload folder. Original example URL: .../ | ||
+ | $file = ( ( isset( $_GET[' | ||
+ | $mbox = file_get_contents( $file ); | ||
+ | |||
+ | // ø is the delimiter between messages | ||
+ | $emails = preg_split( "/ | ||
+ | |||
+ | foreach( $emails as $email ) | ||
+ | { | ||
+ | // | is the delimiter between fields. See field order in the list() | ||
+ | list( $from, $to, $subject, $date, $body ) = preg_split( "/ | ||
+ | |||
+ | // ` is the delimiter surrounding the body | ||
+ | list( $body ) = preg_split( "/ | ||
+ | |||
+ | // Turn the Date: header into a timestamp by splitting it on spaces and colons; in debug mode the split fields are only printed and $timestamp is not assigned. | ||
+ | // Example: Sun, 01 Feb 2009 10:59:37 -0800 (the list() below assumes the leading weekday token is present; a Date: value without it would shift every field) | ||
+ | if( $debug == 1 ) | ||
+ | { | ||
+ | $datefields = preg_split( "/[ :]/", $date, -1, PREG_SPLIT_NO_EMPTY ); | ||
+ | echo " | ||
+ | print_r( $datefields ); | ||
+ | echo "< | ||
+ | } | ||
+ | else | ||
+ | { | ||
+ | list( $junk, $day, $month, $year, $hour, $minute, $second, $junk ) = preg_split( "/[ :]/", $date, -1, PREG_SPLIT_NO_EMPTY ); | ||
+ | $timestamp = mktime($hour, | ||
+ | //echo "< | ||
+ | echo "< | ||
+ | } | ||
+ | |||
+ | |||
+ | // $entry is what gets posted to FlatPress: printed with print_r() when $debug is set, otherwise saved as a draft via draft_save() | ||
+ | $entry = array( ' | ||
+ | |||
+ | if( $debug ) | ||
+ | { | ||
+ | echo "next entry:< | ||
+ | print_r( $entry ); | ||
+ | echo "< | ||
+ | } | ||
+ | else | ||
+ | { | ||
+ | echo " | ||
+ | //$id = entry_save($entry); | ||
+ | $id = draft_save($entry); | ||
+ | } | ||
+ | } | ||
+ | ?> | ||
+ | </ |