commit 3439894f38d1d4140bb912de0232fdb576991de0
parent ea5d039b9e27638407437b19b054854d8674437b
Author: William Morgan <wmorgan-sup@masanjin.net>
Date: Thu, 28 May 2009 10:29:59 -0400
Merge commit 'origin/scanning-speedups'
Conflicts:
bin/sup-sync
lib/sup/index.rb
Diffstat:
15 files changed, 226 insertions(+), 264 deletions(-)
diff --git a/bin/sup-sync b/bin/sup-sync
@@ -184,6 +184,14 @@ begin
## nothin! use default source labels
end
+ if Time.now - last_info_time > PROGRESS_UPDATE_INTERVAL
+ last_info_time = Time.now
+ elapsed = last_info_time - start_time
+ pctdone = source.respond_to?(:pct_done) ? source.pct_done : 100.0 * (source.cur_offset.to_f - source.start_offset).to_f / (source.end_offset - source.start_offset).to_f
+ remaining = (100.0 - pctdone) * (elapsed.to_f / pctdone)
+ $stderr.printf "## read %dm (about %.0f%%) @ %.1fm/s. %s elapsed, about %s remaining\n", num_scanned, pctdone, num_scanned / elapsed, elapsed.to_time_s, remaining.to_time_s
+ end
+
if index_state.nil?
puts "Adding message #{source}##{offset} from #{m.from} with state {#{m.labels * ', '}}" if opts[:verbose]
num_added += 1
diff --git a/lib/sup/draft.rb b/lib/sup/draft.rb
@@ -79,9 +79,7 @@ class DraftLoader < Source
def fn_for_offset o; File.join(@dir, o.to_s); end
def load_header offset
- File.open fn_for_offset(offset) do |f|
- return MBox::read_header(f)
- end
+ File.open(fn_for_offset(offset)) { |f| parse_raw_email_header f }
end
def load_message offset
diff --git a/lib/sup/imap.rb b/lib/sup/imap.rb
@@ -93,7 +93,7 @@ class IMAP < Source
def == o; o.is_a?(IMAP) && o.uri == self.uri && o.username == self.username; end
def load_header id
- MBox::read_header StringIO.new(raw_header(id))
+ parse_raw_email_header StringIO.new(raw_header(id))
end
def load_message id
diff --git a/lib/sup/index.rb b/lib/sup/index.rb
@@ -177,31 +177,31 @@ EOS
end
end
- ## Syncs the message to the index: deleting if it's already there,
- ## and adding either way. Index state will be determined by m.labels.
+ ## Syncs the message to the index, replacing any previous version. adding
+ ## either way. Index state will be determined by the message's #labels
+ ## accessor.
##
- ## docid and entry can be specified if they're already known.
- def sync_message m, docid=nil, entry=nil, opts={}
- docid, entry = load_entry_for_id m.id unless docid && entry
+ ## if need_load is false, docid and entry are assumed to be set to the
+ ## result of load_entry_for_id (which can be nil).
+ def sync_message m, need_load=true, docid=nil, entry=nil, opts={}
+ docid, entry = load_entry_for_id m.id if need_load
raise "no source info for message #{m.id}" unless m.source && m.source_info
@index_mutex.synchronize do
raise "trying to delete non-corresponding entry #{docid} with index message-id #{@index[docid][:message_id].inspect} and parameter message id #{m.id.inspect}" if docid && @index[docid][:message_id] != m.id
end
- source_id =
- if m.source.is_a? Integer
- m.source
- else
- m.source.id or raise "unregistered source #{m.source} (id #{m.source.id.inspect})"
- end
+ source_id = if m.source.is_a? Integer
+ m.source
+ else
+ m.source.id or raise "unregistered source #{m.source} (id #{m.source.id.inspect})"
+ end
- snippet =
- if m.snippet_contains_encrypted_content? && $config[:discard_snippets_from_encrypted_messages]
- ""
- else
- m.snippet
- end
+ snippet = if m.snippet_contains_encrypted_content? && $config[:discard_snippets_from_encrypted_messages]
+ ""
+ else
+ m.snippet
+ end
## write the new document to the index. if the entry already exists in the
## index, reuse it (which avoids having to reload the entry from the source,
@@ -261,15 +261,14 @@ EOS
:refs => (entry[:refs] || (m.refs + m.replytos).uniq.join(" ")),
}
- @index_mutex.synchronize do
+ @index_mutex.synchronize do
@index.delete docid if docid
@index.add_document d
end
- docid, entry = load_entry_for_id m.id
- ## this hasn't been triggered in a long time. TODO: decide whether it's still a problem.
- raise "just added message #{m.id.inspect} but couldn't find it in a search" unless docid
- true
+ ## this hasn't been triggered in a long time.
+ ## docid, entry = load_entry_for_id m.id
+ ## raise "just added message #{m.id.inspect} but couldn't find it in a search" unless docid
end
def save_index fn=File.join(@dir, "ferret")
@@ -414,9 +413,11 @@ EOS
"references" => doc[:refs].split.map { |x| "<#{x}>" }.join(" "),
}
- Message.new :source => source, :source_info => doc[:source_info].to_i,
+ m = Message.new :source => source, :source_info => doc[:source_info].to_i,
:labels => doc[:label].symbolistize,
- :snippet => doc[:snippet], :header => fake_header
+ :snippet => doc[:snippet]
+ m.parse_header fake_header
+ m
end
end
diff --git a/lib/sup/maildir.rb b/lib/sup/maildir.rb
@@ -56,7 +56,7 @@ class Maildir < Source
def load_header id
scan_mailbox
- with_file_for(id) { |f| MBox::read_header f }
+ with_file_for(id) { |f| parse_raw_email_header f }
end
def load_message id
diff --git a/lib/sup/mbox.rb b/lib/sup/mbox.rb
@@ -5,77 +5,7 @@ require "sup/rfc2047"
module Redwood
-## some utility functions. actually these are not mbox-specific at all
-## and should be moved somewhere else.
-##
-## TODO: move functionality to somewhere better, like message.rb
module MBox
- BREAK_RE = /^From \S+/
- HEADER_RE = /\s*(.*?)\s*/
-
- def read_header f
- header = {}
- last = nil
-
- ## i do it in this weird way because i am trying to speed things up
- ## when scanning over large mbox files.
- while(line = f.gets)
- case line
- ## these three can occur multiple times, and we want the first one
- when /^(Delivered-To):#{HEADER_RE}$/i,
- /^(X-Original-To):#{HEADER_RE}$/i,
- /^(Envelope-To):#{HEADER_RE}$/i: header[last = $1] ||= $2
-
- when /^(From):#{HEADER_RE}$/i,
- /^(To):#{HEADER_RE}$/i,
- /^(Cc):#{HEADER_RE}$/i,
- /^(Bcc):#{HEADER_RE}$/i,
- /^(Subject):#{HEADER_RE}$/i,
- /^(Date):#{HEADER_RE}$/i,
- /^(References):#{HEADER_RE}$/i,
- /^(In-Reply-To):#{HEADER_RE}$/i,
- /^(Reply-To):#{HEADER_RE}$/i,
- /^(List-Post):#{HEADER_RE}$/i,
- /^(List-Subscribe):#{HEADER_RE}$/i,
- /^(List-Unsubscribe):#{HEADER_RE}$/i,
- /^(Status):#{HEADER_RE}$/i,
- /^(X-\S+):#{HEADER_RE}$/: header[last = $1] = $2
- when /^(Message-Id):#{HEADER_RE}$/i: header[mid_field = last = $1] = $2
-
- when /^\r*$/: break
- when /^\S+:/: last = nil # some other header we don't care about
- else
- header[last] += " " + line.chomp.gsub(/^\s+/, "") if last
- end
- end
-
- if mid_field && header[mid_field] && header[mid_field] =~ /<(.*?)>/
- header[mid_field] = $1
- end
-
- header.each do |k, v|
- next unless Rfc2047.is_encoded? v
- header[k] =
- begin
- Rfc2047.decode_to $encoding, v
- rescue Errno::EINVAL, Iconv::InvalidEncoding, Iconv::IllegalSequence => e
- Redwood::log "warning: error decoding RFC 2047 header (#{e.class.name}): #{e.message}"
- v
- end
- end
- header
- end
-
- ## never actually called
- def read_body f
- body = []
- f.each_line do |l|
- break if l =~ BREAK_RE
- body << l.chomp
- end
- body
- end
-
- module_function :read_header, :read_body
+ BREAK_RE = /^From \S+/ ######### TODO REMOVE ME
end
end
diff --git a/lib/sup/mbox/loader.rb b/lib/sup/mbox/loader.rb
@@ -59,7 +59,7 @@ class Loader < Source
unless l =~ BREAK_RE
raise OutOfSyncSourceError, "mismatch in mbox file offset #{offset.inspect}: #{l.inspect}."
end
- header = MBox::read_header @f
+ header = parse_raw_email_header @f
end
header
end
@@ -98,7 +98,7 @@ class Loader < Source
@mutex.synchronize do
@f.seek offset
until @f.eof? || (l = @f.gets) =~ /^\r*$/
- ret += l
+ ret << l
end
end
ret
@@ -106,7 +106,7 @@ class Loader < Source
def raw_message offset
ret = ""
- each_raw_message_line(offset) { |l| ret += l }
+ each_raw_message_line(offset) { |l| ret << l }
ret
end
diff --git a/lib/sup/message.rb b/lib/sup/message.rb
@@ -60,49 +60,42 @@ class Message
## why.
@refs = []
- parse_header(opts[:header] || @source.load_header(@source_info))
+ #parse_header(opts[:header] || @source.load_header(@source_info))
end
def parse_header header
- header.keys.each { |k| header[k.downcase] = header[k] } # canonicalize
-
- fakeid = nil
- fakename = nil
-
- @id =
- if header["message-id"]
- sanitize_message_id header["message-id"]
- else
- fakeid = "sup-faked-" + Digest::MD5.hexdigest(raw_header)
- end
+ @id = if header["message-id"]
+ mid = header["message-id"] =~ /<(.+?)>/ ? $1 : header["message-id"]
+ sanitize_message_id mid
+ else
+ id = "sup-faked-" + Digest::MD5.hexdigest(raw_header)
+ from = header["from"]
+ #Redwood::log "faking non-existent message-id for message from #{from}: #{id}"
+ id
+ end
- @from =
- if header["from"]
- Person.from_address header["from"]
- else
- fakename = "Sup Auto-generated Fake Sender <sup@fake.sender.example.com>"
- Person.from_address fakename
- end
-
- Redwood::log "faking message-id for message from #@from: #{id}" if fakeid
- Redwood::log "faking from for message #@id: #{fakename}" if fakename
-
- date = header["date"]
- @date =
- case date
- when Time
- date
- when String
- begin
- Time.parse date
- rescue ArgumentError => e
- Redwood::log "faking date header for #{@id} due to error parsing date #{header['date'].inspect}: #{e.message}"
- Time.now
- end
- else
- Redwood::log "faking date header for #{@id}"
+ @from = Person.from_address(if header["from"]
+ header["from"]
+ else
+ name = "Sup Auto-generated Fake Sender <sup@fake.sender.example.com>"
+ #Redwood::log "faking non-existent sender for message #@id: #{name}"
+ name
+ end)
+
+ @date = case(date = header["date"])
+ when Time
+ date
+ when String
+ begin
+ Time.parse date
+ rescue ArgumentError => e
+ #Redwood::log "faking mangled date header for #{@id} (orig #{header['date'].inspect} gave error: #{e.message})"
Time.now
end
+ else
+ #Redwood::log "faking non-existent date header for #{@id}"
+ Time.now
+ end
@subj = header.member?("subject") ? header["subject"].gsub(/\s+/, " ").gsub(/\s+$/, "") : DEFAULT_SUBJECT
@to = Person.from_address_list header["to"]
@@ -130,7 +123,6 @@ class Message
@list_subscribe = header["list-subscribe"]
@list_unsubscribe = header["list-unsubscribe"]
end
- private :parse_header
def add_ref ref
@refs << ref
diff --git a/lib/sup/modes/edit-message-mode.rb b/lib/sup/modes/edit-message-mode.rb
@@ -12,7 +12,7 @@ class EditMessageMode < LineCursorMode
FORCE_HEADERS = %w(From To Cc Bcc Subject)
MULTI_HEADERS = %w(To Cc Bcc)
- NON_EDITABLE_HEADERS = %w(Message-Id Date)
+ NON_EDITABLE_HEADERS = %w(Message-id Date)
HookManager.register "signature", <<EOS
Generates a message signature.
@@ -214,7 +214,7 @@ protected
def parse_file fn
File.open(fn) do |f|
- header = MBox::read_header f
+ header = Source.parse_raw_email_header(f).inject({}) { |h, (k, v)| h[k.capitalize] = v; h } # lousy HACK
body = f.readlines.map { |l| l.chomp }
header.delete_if { |k, v| NON_EDITABLE_HEADERS.member? k }
diff --git a/lib/sup/poll.rb b/lib/sup/poll.rb
@@ -149,6 +149,8 @@ EOS
begin
m = Message.new :source => source, :source_info => offset, :labels => labels
+ m.load_from_source!
+
if m.source_marked_read?
m.remove_label :unread
labels.delete :unread
@@ -157,7 +159,7 @@ EOS
docid, entry = Index.load_entry_for_id m.id
HookManager.run "before-add-message", :message => m
m = yield(m, offset, entry) or next if block_given?
- Index.sync_message m, docid, entry, opts
+ times = Index.sync_message m, false, docid, entry, opts
UpdateManager.relay self, :added, m unless entry
rescue MessageFormatError => e
Redwood::log "ignoring erroneous message at #{source}##{offset}: #{e.message}"
diff --git a/lib/sup/source.rb b/lib/sup/source.rb
@@ -99,7 +99,49 @@ class Source
end
end
+ ## read a raw email header from a filehandle (or anything that responds to
+ ## #gets), and turn it into a hash of key-value pairs.
+ ##
+ ## WARNING! THIS IS A SPEED-CRITICAL SECTION. Everything you do here will have
+ ## a significant effect on Sup's processing speed of email from ALL sources.
+ ## Little things like string interpolation, regexp interpolation, += vs <<,
+ ## all have DRAMATIC effects. BE CAREFUL WHAT YOU DO!
+ def self.parse_raw_email_header f
+ header = {}
+ last = nil
+
+ while(line = f.gets)
+ case line
+ ## these three can occur multiple times, and we want the first one
+ when /^(Delivered-To|X-Original-To|Envelope-To):\s*(.*?)\s*$/i; header[last = $1.downcase] ||= $2
+ ## mark this guy specially. not sure why i care.
+ when /^([^:\s]+):\s*(.*?)\s*$/i; header[last = $1.downcase] = $2
+ when /^\r*$/; break
+ else
+ if last
+ header[last] << " " unless header[last].empty?
+ header[last] << line.strip
+ end
+ end
+ end
+
+ %w(subject from to cc bcc).each do |k|
+ v = header[k] or next
+ next unless Rfc2047.is_encoded? v
+ header[k] = begin
+ Rfc2047.decode_to $encoding, v
+ rescue Errno::EINVAL, Iconv::InvalidEncoding, Iconv::IllegalSequence => e
+ #Redwood::log "warning: error decoding RFC 2047 header (#{e.class.name}): #{e.message}"
+ v
+ end
+ end
+ header
+ end
+
protected
+
+ ## convenience function
+ def parse_raw_email_header f; self.class.parse_raw_email_header f end
def Source.expand_filesystem_uri uri
uri.gsub "~", File.expand_path("~")
diff --git a/test/dummy_source.rb b/test/dummy_source.rb
@@ -26,7 +26,7 @@ class DummySource < Source
end
def load_header offset
- MBox::read_header StringIO.new(raw_header(offset))
+ Source.parse_raw_email_header StringIO.new(raw_header(offset))
end
def load_message offset
diff --git a/test/test_header_parsing.rb b/test/test_header_parsing.rb
@@ -0,0 +1,107 @@
+#!/usr/bin/ruby
+
+require 'test/unit'
+require 'sup'
+require 'stringio'
+
+include Redwood
+
+class TestMBoxParsing < Test::Unit::TestCase
+ def setup
+ end
+
+ def teardown
+ end
+
+ def test_normal_headers
+ h = Source.parse_raw_email_header StringIO.new(<<EOS)
+From: Bob <bob@bob.com>
+To: Sally <sally@sally.com>
+EOS
+
+ assert_equal "Bob <bob@bob.com>", h["from"]
+ assert_equal "Sally <sally@sally.com>", h["to"]
+ assert_nil h["message-id"]
+ end
+
+ def test_multiline
+ h = Source.parse_raw_email_header StringIO.new(<<EOS)
+From: Bob <bob@bob.com>
+Subject: one two three
+ four five six
+To: Sally <sally@sally.com>
+References: <seven>
+ <eight>
+Seven: Eight
+EOS
+
+ assert_equal "one two three four five six", h["subject"]
+ assert_equal "Sally <sally@sally.com>", h["to"]
+ assert_equal "<seven> <eight>", h["references"]
+ end
+
+ def test_ignore_spacing
+ variants = [
+ "Subject:one two three end\n",
+ "Subject: one two three end\n",
+ "Subject: one two three end \n",
+ ]
+ variants.each do |s|
+ h = Source.parse_raw_email_header StringIO.new(s)
+ assert_equal "one two three end", h["subject"]
+ end
+ end
+
+ def test_message_id_ignore_spacing
+ variants = [
+ "Message-Id: <one@bob.com> \n",
+ "Message-Id:<one@bob.com> \n",
+ ]
+ variants.each do |s|
+ h = Source.parse_raw_email_header StringIO.new(s)
+ assert_equal "<one@bob.com>", h["message-id"]
+ end
+ end
+
+ def test_blank_lines
+ h = Source.parse_raw_email_header StringIO.new("")
+ assert_equal nil, h["message-id"]
+ end
+
+ def test_empty_headers
+ variants = [
+ "Message-Id: \n",
+ "Message-Id:\n",
+ ]
+ variants.each do |s|
+ h = Source.parse_raw_email_header StringIO.new(s)
+ assert_equal "", h["message-id"]
+ end
+ end
+
+ def test_detect_end_of_headers
+ h = Source.parse_raw_email_header StringIO.new(<<EOS)
+From: Bob <bob@bob.com>
+
+To: a dear friend
+EOS
+ assert_equal "Bob <bob@bob.com>", h["from"]
+ assert_nil h["to"]
+
+ h = Source.parse_raw_email_header StringIO.new(<<EOS)
+From: Bob <bob@bob.com>
+\r
+To: a dear friend
+EOS
+ assert_equal "Bob <bob@bob.com>", h["from"]
+ assert_nil h["to"]
+
+ h = Source.parse_raw_email_header StringIO.new(<<EOS)
+From: Bob <bob@bob.com>
+\r\n\r
+To: a dear friend
+EOS
+ assert_equal "Bob <bob@bob.com>", h["from"]
+ assert_nil h["to"]
+ end
+end
diff --git a/test/test_mbox_parsing.rb b/test/test_mbox_parsing.rb
@@ -1,118 +0,0 @@
-#!/usr/bin/ruby
-
-require 'test/unit'
-require 'sup'
-require 'stringio'
-
-include Redwood
-
-class TestMBoxParsing < Test::Unit::TestCase
- def setup
- end
-
- def teardown
- end
-
- def test_normal_headers
- h = MBox.read_header StringIO.new(<<EOS)
-From: Bob <bob@bob.com>
-To: Sally <sally@sally.com>
-EOS
-
- assert_equal "Bob <bob@bob.com>", h["From"]
- assert_equal "Sally <sally@sally.com>", h["To"]
- assert_nil h["Message-Id"]
- end
-
- ## this is shitty behavior in retrospect, but it's built in now.
- def test_message_id_stripping
- h = MBox.read_header StringIO.new("Message-Id: <one@bob.com>\n")
- assert_equal "one@bob.com", h["Message-Id"]
-
- h = MBox.read_header StringIO.new("Message-Id: one@bob.com\n")
- assert_equal "one@bob.com", h["Message-Id"]
- end
-
- def test_multiline
- h = MBox.read_header StringIO.new(<<EOS)
-From: Bob <bob@bob.com>
-Subject: one two three
- four five six
-To: Sally <sally@sally.com>
-References: seven
- eight
-Seven: Eight
-EOS
-
- assert_equal "one two three four five six", h["Subject"]
- assert_equal "Sally <sally@sally.com>", h["To"]
- assert_equal "seven eight", h["References"]
- end
-
- def test_ignore_spacing
- variants = [
- "Subject:one two three end\n",
- "Subject: one two three end\n",
- "Subject: one two three end \n",
- ]
- variants.each do |s|
- h = MBox.read_header StringIO.new(s)
- assert_equal "one two three end", h["Subject"]
- end
- end
-
- def test_message_id_ignore_spacing
- variants = [
- "Message-Id: <one@bob.com> \n",
- "Message-Id: one@bob.com \n",
- "Message-Id:<one@bob.com> \n",
- "Message-Id:one@bob.com \n",
- ]
- variants.each do |s|
- h = MBox.read_header StringIO.new(s)
- assert_equal "one@bob.com", h["Message-Id"]
- end
- end
-
- def test_blank_lines
- h = MBox.read_header StringIO.new("")
- assert_equal nil, h["Message-Id"]
- end
-
- def test_empty_headers
- variants = [
- "Message-Id: \n",
- "Message-Id:\n",
- ]
- variants.each do |s|
- h = MBox.read_header StringIO.new(s)
- assert_equal "", h["Message-Id"]
- end
- end
-
- def test_detect_end_of_headers
- h = MBox.read_header StringIO.new(<<EOS)
-From: Bob <bob@bob.com>
-
-To: a dear friend
-EOS
- assert_equal "Bob <bob@bob.com>", h["From"]
- assert_nil h["To"]
-
- h = MBox.read_header StringIO.new(<<EOS)
-From: Bob <bob@bob.com>
-\r
-To: a dear friend
-EOS
- assert_equal "Bob <bob@bob.com>", h["From"]
- assert_nil h["To"]
-
- h = MBox.read_header StringIO.new(<<EOS)
-From: Bob <bob@bob.com>
-\r\n\r
-To: a dear friend
-EOS
- assert_equal "Bob <bob@bob.com>", h["From"]
- assert_nil h["To"]
- end
-end
diff --git a/test/test_message.rb b/test/test_message.rb
@@ -511,7 +511,7 @@ EOS
# Look at another header field whose first line was blank.
list_unsubscribe = sup_message.list_unsubscribe
- assert_equal(" <http://mailman2.widget.com/mailman/listinfo/monitor-list>, " +
+ assert_equal("<http://mailman2.widget.com/mailman/listinfo/monitor-list>, " +
"<mailto:monitor-list-request@widget.com?subject=unsubscribe>",
list_unsubscribe)