commit 47133afaad9c0a0eb34cf1b4fb77a251388a4359
parent d41f2472b3c45b3fbaaa9e02cc7ac62c4295c575
Author: William Morgan <wmorgan-sup@masanjin.net>
Date: Mon, 18 May 2009 07:10:27 -0700
Merge branch 'various-mbox-fixes' into next
Conflicts:
lib/sup/mbox.rb
test/test_mbox_parsing.rb
Diffstat:
3 files changed, 72 insertions(+), 8 deletions(-)
diff --git a/lib/sup/mbox.rb b/lib/sup/mbox.rb
@@ -6,6 +6,20 @@ require "sup/rfc2047"
module Redwood
module MBox
- BREAK_RE = /^From \S+@\S+ /
+ BREAK_RE = /^From \S+ (.+)$/
+
+ def is_break_line? l
+ l =~ BREAK_RE or return false
+ time = $1
+ begin
+ ## hack -- make Time.parse fail when trying to substitute values from Time.now
+ Time.parse time, 0
+ true
+ rescue NoMethodError
+ Redwood::log "found invalid date in potential mbox split line, not splitting: #{l.inspect}"
+ false
+ end
+ end
+ module_function :is_break_line?
end
end
diff --git a/lib/sup/mbox/loader.rb b/lib/sup/mbox/loader.rb
@@ -9,7 +9,7 @@ class Loader < Source
attr_accessor :labels
## uri_or_fp is horrific. need to refactor.
- def initialize uri_or_fp, start_offset=nil, usual=true, archived=false, id=nil, labels=[]
+ def initialize uri_or_fp, start_offset=0, usual=true, archived=false, id=nil, labels=[]
@mutex = Mutex.new
@labels = ((labels || []) - LabelManager::RESERVED_LABELS).uniq.freeze
@@ -56,7 +56,7 @@ class Loader < Source
@mutex.synchronize do
@f.seek offset
l = @f.gets
- unless l =~ BREAK_RE
+ unless MBox::is_break_line? l
raise OutOfSyncSourceError, "mismatch in mbox file offset #{offset.inspect}: #{l.inspect}."
end
header = parse_raw_email_header @f
@@ -72,7 +72,7 @@ class Loader < Source
## "From" at the start of a message body line.
string = ""
l = @f.gets
- string << l until @f.eof? || (l = @f.gets) =~ BREAK_RE
+ string << l until @f.eof? || MBox::is_break_line?(l = @f.gets)
RMail::Parser.read string
rescue RMail::Parser::Error => e
raise FatalSourceError, "error parsing mbox file: #{e.message}"
@@ -85,7 +85,7 @@ class Loader < Source
@mutex.synchronize do
@f.seek cur_offset
string = ""
- until @f.eof? || (l = @f.gets) =~ BREAK_RE
+ until @f.eof? || MBox::is_break_line?(l = @f.gets)
string << l
end
self.cur_offset += string.length
@@ -119,7 +119,7 @@ class Loader < Source
@mutex.synchronize do
@f.seek offset
yield @f.gets
- until @f.eof? || (l = @f.gets) =~ BREAK_RE
+ until @f.eof? || MBox::is_break_line?(l = @f.gets)
yield l
end
end
@@ -140,7 +140,7 @@ class Loader < Source
## 2. at the beginning of an mbox separator (in all other
## cases).
- l = @f.gets or raise "next while at EOF"
+ l = @f.gets or return nil
if l =~ /^\s*$/ # case 1
returned_offset = @f.tell
@f.gets # now we're at a BREAK_RE, so skip past it
@@ -150,7 +150,7 @@ class Loader < Source
end
while(line = @f.gets)
- break if line =~ BREAK_RE
+ break if MBox::is_break_line? line
next_offset = @f.tell
end
end
diff --git a/test/test_header_parsing.rb b/test/test_header_parsing.rb
@@ -104,4 +104,54 @@ EOS
assert_equal "Bob <bob@bob.com>", h["from"]
assert_nil h["to"]
end
+
+ def test_from_line_splitting # body lines starting with "From " but lacking a parseable date must not split the message
+ l = MBox::Loader.new StringIO.new(<<EOS)
+From sup-talk-bounces@rubyforge.org Mon Apr 27 12:56:18 2009
+From: Bob <bob@bob.com>
+To: a dear friend
+
+Hello there friend. How are you?
+
+From sea to shining sea
+
+From bob@bob.com I get only spam.
+
+From bob@bob.com
+
+From bob@bob.com
+
+(that second one has spaces at the endj
+
+This is the end of the email.
+EOS
+ offset, labels = l.next # first (and only expected) message
+ assert_equal 0, offset
+ offset, labels = l.next # nothing left: none of the "From ..." body lines counted as a separator
+ assert_nil offset
+ end
+
+ def test_more_from_line_splitting # two "From " lines carrying valid dates must yield two messages
+ l = MBox::Loader.new StringIO.new(<<EOS)
+From sup-talk-bounces@rubyforge.org Mon Apr 27 12:56:18 2009
+From: Bob <bob@bob.com>
+To: a dear friend
+
+Hello there friend. How are you?
+
+From bob@bob.com Mon Apr 27 12:56:19 2009
+From: Bob <bob@bob.com>
+To: a dear friend
+
+Hello again! Would you like to buy my products?
+EOS
+ offset, labels = l.next # first message
+ assert_not_nil offset
+
+ offset, labels = l.next # second message, introduced by the dated "From bob@bob.com ..." line
+ assert_not_nil offset
+
+ offset, labels = l.next # no third message
+ assert_nil offset
+ end
end