commit 67f4b1d32fa1dce2704c7bc21eb461875356962c
parent eabc808e1f79297aff65bb9e3adacf59a72b90ba
Author: William Morgan <wmorgan-sup@masanjin.net>
Date: Sun, 17 May 2009 11:37:43 -0700
check for a correct date on mbox From lines
Determine whether a line is a message separator by requiring a leading
"From ", a non-whitespace token, and then a date that Time.parse accepts.
Diffstat:
3 files changed, 70 insertions(+), 6 deletions(-)
diff --git a/lib/sup/mbox.rb b/lib/sup/mbox.rb
@@ -10,9 +10,23 @@ module Redwood
##
## TODO: move functionality to somewhere better, like message.rb
module MBox
- BREAK_RE = /^From \S+@\S+ /
+ BREAK_RE = /^From \S+ (.+)$/
HEADER_RE = /\s*(.*?)\s*/
+ def is_break_line? l
+ l =~ BREAK_RE or return false
+ time = $1
+ begin
+ ## hack -- make Time.parse fail when trying to substitute values from Time.now
+ Time.parse time, 0
+ true
+ rescue NoMethodError
+ Redwood::log "found invalid date in potential mbox split line, not splitting: #{l.inspect}"
+ false
+ end
+ end
+ module_function :is_break_line?
+
def read_header f
header = {}
last = nil
@@ -70,7 +84,7 @@ module MBox
def read_body f
body = []
f.each_line do |l|
- break if l =~ BREAK_RE
+ break if is_break_line?(l)
body << l.chomp
end
body
diff --git a/lib/sup/mbox/loader.rb b/lib/sup/mbox/loader.rb
@@ -56,7 +56,7 @@ class Loader < Source
@mutex.synchronize do
@f.seek offset
l = @f.gets
- unless l =~ BREAK_RE
+ unless MBox::is_break_line? l
raise OutOfSyncSourceError, "mismatch in mbox file offset #{offset.inspect}: #{l.inspect}."
end
header = MBox::read_header @f
@@ -72,7 +72,7 @@ class Loader < Source
## "From" at the start of a message body line.
string = ""
l = @f.gets
- string << l until @f.eof? || (l = @f.gets) =~ BREAK_RE
+ string << l until @f.eof? || MBox::is_break_line?(l = @f.gets)
RMail::Parser.read string
rescue RMail::Parser::Error => e
raise FatalSourceError, "error parsing mbox file: #{e.message}"
@@ -107,7 +107,7 @@ class Loader < Source
@mutex.synchronize do
@f.seek offset
yield @f.gets
- until @f.eof? || (l = @f.gets) =~ BREAK_RE
+ until @f.eof? || MBox::is_break_line?(l = @f.gets)
yield l
end
end
@@ -138,7 +138,7 @@ class Loader < Source
end
while(line = @f.gets)
- break if line =~ BREAK_RE
+ break if MBox::is_break_line? line
next_offset = @f.tell
end
end
diff --git a/test/test_mbox_parsing.rb b/test/test_mbox_parsing.rb
@@ -115,4 +115,54 @@ EOS
assert_equal "Bob <bob@bob.com>", h["From"]
assert_nil h["To"]
end
+
+ ## A single message whose body contains several lines beginning with
+ ## "From " that are not followed by a date. The assertions below expect
+ ## the loader to yield exactly one message (at offset 0) and then a nil
+ ## offset, i.e. none of the body lines may be treated as a separator.
+ def test_from_line_splitting
+ ## NOTE(review): the fixture text says the second bare "From bob@bob.com"
+ ## line ends in spaces; confirm that trailing whitespace is still present
+ ## in the file, since editors and diff renderers tend to strip it.
+ l = MBox::Loader.new StringIO.new(<<EOS)
+From sup-talk-bounces@rubyforge.org Mon Apr 27 12:56:18 2009
+From: Bob <bob@bob.com>
+To: a dear friend
+
+Hello there friend. How are you?
+
+From sea to shining sea
+
+From bob@bob.com I get only spam.
+
+From bob@bob.com
+
+From bob@bob.com
+
+(that second one has spaces at the endj
+
+This is the end of the email.
+EOS
+ ## the only message starts at the very beginning of the input
+ offset, labels = l.next
+ assert_equal 0, offset
+ ## a second call must not find another message
+ offset, labels = l.next
+ assert_nil offset
+ end
+
+ def test_more_from_line_splitting
+ l = MBox::Loader.new StringIO.new(<<EOS)
+From sup-talk-bounces@rubyforge.org Mon Apr 27 12:56:18 2009
+From: Bob <bob@bob.com>
+To: a dear friend
+
+Hello there friend. How are you?
+
+From bob@bob.com Mon Apr 27 12:56:19 2009
+From: Bob <bob@bob.com>
+To: a dear friend
+
+Hello again! Would you like to buy my products?
+EOS
+ offset, labels = l.next
+ assert_not_nil offset
+
+ offset, labels = l.next
+ assert_not_nil offset
+
+ offset, labels = l.next
+ assert_nil offset
+ end
end