* [sup-talk] [PATCH] xapian: do less work for update_message_state
@ 2009-09-13 18:44 Rich Lane
2009-09-30 19:40 ` William Morgan
0 siblings, 1 reply; 8+ messages in thread
From: Rich Lane @ 2009-09-13 18:44 UTC (permalink / raw)
Refactor index_message so that we do the minimal amount of work based on what
state the user has modified.
---
lib/sup/xapian_index.rb | 241 +++++++++++++++++++++++++++--------------------
1 files changed, 137 insertions(+), 104 deletions(-)
diff --git a/lib/sup/xapian_index.rb b/lib/sup/xapian_index.rb
index e1cfe65..ad45b0e 100644
--- a/lib/sup/xapian_index.rb
+++ b/lib/sup/xapian_index.rb
@@ -42,8 +42,6 @@ EOS
@xapian = Xapian::WritableDatabase.new(path, Xapian::DB_CREATE)
@xapian.set_metadata 'version', INDEX_VERSION
end
- @term_generator = Xapian::TermGenerator.new()
- @term_generator.stemmer = Xapian::Stem.new(STEM_LANGUAGE)
@enquire = Xapian::Enquire.new @xapian
@enquire.weighting_scheme = Xapian::BoolWeight.new
@enquire.docid_order = Xapian::Enquire::ASCENDING
@@ -91,41 +89,9 @@ EOS
m
end
- def add_message m; sync_message m end
- def update_message m; sync_message m end
- def update_message_state m; sync_message m end
-
- def sync_message m, opts={}
- entry = synchronize { get_entry m.id }
- snippet = m.snippet
- entry ||= {}
- labels = m.labels
- entry = {} if opts[:force_overwrite]
-
- d = {
- :message_id => m.id,
- :source_id => m.source.id,
- :source_info => m.source_info,
- :date => (entry[:date] || m.date),
- :snippet => snippet,
- :labels => labels,
- :from => (entry[:from] || [m.from.email, m.from.name]),
- :to => (entry[:to] || m.to.map { |p| [p.email, p.name] }),
- :cc => (entry[:cc] || m.cc.map { |p| [p.email, p.name] }),
- :bcc => (entry[:bcc] || m.bcc.map { |p| [p.email, p.name] }),
- :subject => m.subj,
- :refs => (entry[:refs] || m.refs),
- :replytos => (entry[:replytos] || m.replytos),
- }
-
- labels.each { |l| LabelManager << l }
-
- synchronize do
- index_message m, d, opts
- end
- true
- end
- private :sync_message
+ def add_message m; sync_message m, true end
+ def update_message m; sync_message m, true end
+ def update_message_state m; sync_message m, false end
def num_results_for query={}
xapian_query = build_xapian_query query
@@ -153,7 +119,6 @@ EOS
def each_message_in_thread_for m, opts={}
# TODO thread by subject
- # TODO handle killed threads
return unless doc = find_doc(m.id)
queue = doc.value(THREAD_VALUENO).split(',')
msgids = [m.id]
@@ -438,100 +403,140 @@ EOS
end
end
- def index_message m, entry, opts
- terms = []
- text = []
+ def sync_message m, overwrite
+ doc = synchronize { find_doc(m.id) }
+ existed = doc != nil
+ doc ||= Xapian::Document.new
+ do_index_static = overwrite || !existed
+ old_entry = !do_index_static && doc.entry
+ snippet = do_index_static ? m.snippet : old_entry[:snippet]
- subject_text = m.indexable_subject
- body_text = m.indexable_body
+ entry = {
+ :message_id => m.id,
+ :source_id => m.source.id,
+ :source_info => m.source_info,
+ :date => m.date,
+ :snippet => snippet,
+ :labels => m.labels.to_a,
+ :from => [m.from.email, m.from.name],
+ :to => m.to.map { |p| [p.email, p.name] },
+ :cc => m.cc.map { |p| [p.email, p.name] },
+ :bcc => m.bcc.map { |p| [p.email, p.name] },
+ :subject => m.subj,
+ :refs => m.refs.to_a,
+ :replytos => m.replytos.to_a,
+ }
+ if do_index_static
+ doc.clear_terms
+ doc.clear_values
+ index_message_static m, doc, entry
+ end
+
+ index_message_threading doc, entry, old_entry
+ index_message_labels doc, entry[:labels], (do_index_static ? [] : old_entry[:labels])
+ doc.entry = entry
+
+ synchronize do
+ unless docid = existed ? doc.docid : assign_docid(m, truncate_date(m.date))
+ # Could be triggered by spam
+ warn "docid underflow, dropping #{m.id.inspect}"
+ return
+ end
+ @xapian.replace_document docid, doc
+ end
+
+ m.labels.each { |l| LabelManager << l }
+ true
+ end
+
+ ## Index content that can't be changed by the user
+ def index_message_static m, doc, entry
# Person names are indexed with several prefixes
person_termer = lambda do |d|
lambda do |p|
["#{d}_name", "name", "body"].each do |x|
- text << [p.name, PREFIX[x]]
+ doc.index_text p.name, PREFIX[x]
end if p.name
- [d, :any].each { |x| terms << mkterm(:email, x, p.email) }
+ [d, :any].each { |x| doc.add_term mkterm(:email, x, p.email) }
end
end
person_termer[:from][m.from] if m.from
(m.to+m.cc+m.bcc).each(&(person_termer[:to]))
- terms << mkterm(:date,m.date) if m.date
- m.labels.each { |t| terms << mkterm(:label,t) }
- terms << mkterm(:type, 'mail')
- terms << mkterm(:msgid, m.id)
- terms << mkterm(:source_id, m.source.id)
+ # Full text search content
+ subject_text = m.indexable_subject
+ body_text = m.indexable_body
+ doc.index_text subject_text, PREFIX['subject']
+ doc.index_text subject_text, PREFIX['body']
+ doc.index_text body_text, PREFIX['body']
+ m.attachments.each { |a| doc.index_text a, PREFIX['attachment'] }
+
+ # Miscellaneous terms
+ doc.add_term mkterm(:date, m.date) if m.date
+ doc.add_term mkterm(:type, 'mail')
+ doc.add_term mkterm(:msgid, m.id)
+ doc.add_term mkterm(:source_id, m.source.id)
m.attachments.each do |a|
a =~ /\.(\w+)$/ or next
- t = mkterm(:attachment_extension, $1)
- terms << t
+ doc.add_term mkterm(:attachment_extension, $1)
+ end
+
+ # Date value for range queries
+ date_value = begin
+ Xapian.sortable_serialise m.date.to_i
+ rescue TypeError
+ Xapian.sortable_serialise 0
end
- ## Thread membership
- children = term_docids(mkterm(:ref, m.id)).map { |docid| @xapian.document docid }
- parent_ids = m.refs + m.replytos
+ doc.add_value MSGID_VALUENO, m.id
+ doc.add_value DATE_VALUENO, date_value
+ end
+
+ def index_message_labels doc, new_labels, old_labels
+ return if new_labels == old_labels
+ added = new_labels.to_a - old_labels.to_a
+ removed = old_labels.to_a - new_labels.to_a
+ added.each { |t| doc.add_term mkterm(:label,t) }
+ removed.each { |t| doc.remove_term mkterm(:label,t) }
+ end
+
+ ## Assign a set of thread ids to the document. This is a hybrid of the runtime
+ ## search done by the Ferret index and the index-time union done by previous
+ ## versions of the Xapian index. We first find the thread ids of all messages
+ ## with a reference to or from us. If that set is empty, we use our own
+ ## message id. Otherwise, we use all the thread ids we previously found. In
+ ## the common case there's only one member in that set, but if we're the
+ ## missing link between multiple previously unrelated threads we can have
+ ## more. XapianIndex#each_message_in_thread_for follows the thread ids when
+ ## searching so the user sees a single unified thread.
+ def index_message_threading doc, entry, old_entry
+ return if old_entry && (entry[:refs] == old_entry[:refs]) && (entry[:replytos] == old_entry[:replytos])
+ children = term_docids(mkterm(:ref, entry[:message_id])).map { |docid| @xapian.document docid }
+ parent_ids = entry[:refs] + entry[:replytos]
parents = parent_ids.map { |id| find_doc id }.compact
thread_members = SavingHash.new { [] }
(children + parents).each do |doc2|
thread_ids = doc2.value(THREAD_VALUENO).split ','
thread_ids.each { |thread_id| thread_members[thread_id] << doc2 }
end
+ thread_ids = thread_members.empty? ? [entry[:message_id]] : thread_members.keys
+ thread_ids.each { |thread_id| doc.add_term mkterm(:thread, thread_id) }
+ parent_ids.each { |ref| doc.add_term mkterm(:ref, ref) }
+ doc.add_value THREAD_VALUENO, (thread_ids * ',')
+ end
- thread_ids = thread_members.empty? ? [m.id] : thread_members.keys
-
- thread_ids.each { |thread_id| terms << mkterm(:thread, thread_id) }
- parent_ids.each do |ref|
- terms << mkterm(:ref, ref)
- end
-
- # Full text search content
- text << [subject_text, PREFIX['subject']]
- text << [subject_text, PREFIX['body']]
- text << [body_text, PREFIX['body']]
- m.attachments.each { |a| text << [a, PREFIX['attachment']] }
-
- truncated_date = if m.date < MIN_DATE
- debug "warning: adjusting too-low date #{m.date} for indexing"
+ def truncate_date date
+ if date < MIN_DATE
+ debug "warning: adjusting too-low date #{date} for indexing"
MIN_DATE
- elsif m.date > MAX_DATE
- debug "warning: adjusting too-high date #{m.date} for indexing"
+ elsif date > MAX_DATE
+ debug "warning: adjusting too-high date #{date} for indexing"
MAX_DATE
else
- m.date
- end
-
- # Date value for range queries
- date_value = begin
- Xapian.sortable_serialise truncated_date.to_i
- rescue TypeError
- Xapian.sortable_serialise 0
- end
-
- docid = nil
- unless doc = find_doc(m.id)
- doc = Xapian::Document.new
- if not docid = assign_docid(m, truncated_date)
- # Could be triggered by spam
- Redwood::log "warning: docid underflow, dropping #{m.id.inspect}"
- return
- end
- else
- doc.clear_terms
- doc.clear_values
- docid = doc.docid
+ date
end
-
- @term_generator.document = doc
- text.each { |text,prefix| @term_generator.index_text text, 1, prefix }
- terms.each { |term| doc.add_term term if term.length <= MAX_TERM_LENGTH }
- doc.add_value MSGID_VALUENO, m.id
- doc.add_value THREAD_VALUENO, (thread_ids * ',')
- doc.add_value DATE_VALUENO, date_value
- doc.data = Marshal.dump entry
-
- @xapian.replace_document docid, doc
end
# Construct a Xapian term
@@ -561,6 +566,34 @@ EOS
end
end
+ module DocumentMethods
+ def entry
+ Marshal.load data
+ end
+
+ def entry=(x)
+ self.data = Marshal.dump x
+ end
+
+ def index_text text, prefix, weight=1
+ term_generator = Xapian::TermGenerator.new
+ term_generator.stemmer = Xapian::Stem.new(STEM_LANGUAGE)
+ term_generator.document = self
+ term_generator.index_text text, weight, prefix
+ end
+
+ def add_term term
+ if term.length <= MAX_TERM_LENGTH
+ super term
+ else
+ warn "dropping excessively long term #{term}"
+ end
+ end
+ end
+end
+
end
+class Xapian::Document
+ include Redwood::XapianIndex::DocumentMethods
end
--
1.6.4.2
^ permalink raw reply [flat|nested] 8+ messages in thread
* [sup-talk] [PATCH] xapian: do less work for update_message_state
2009-09-13 18:44 [sup-talk] [PATCH] xapian: do less work for update_message_state Rich Lane
@ 2009-09-30 19:40 ` William Morgan
2009-09-30 20:16 ` Rich Lane
0 siblings, 1 reply; 8+ messages in thread
From: William Morgan @ 2009-09-30 19:40 UTC (permalink / raw)
Reformatted excerpts from Rich Lane's message of 2009-09-13:
> Refactor index_message so that we do the minimal amount of work based
> on what state the user has modified.
Branch xapian-message-state, merged into next. Thanks!
BTW I'm excited about the direction this is going. State changes on
large numbers of messages seem significantly faster with this.
--
William <wmorgan-sup at masanjin.net>
^ permalink raw reply [flat|nested] 8+ messages in thread
* [sup-talk] [PATCH] xapian: do less work for update_message_state
2009-09-30 19:40 ` William Morgan
@ 2009-09-30 20:16 ` Rich Lane
2009-10-01 13:46 ` William Morgan
0 siblings, 1 reply; 8+ messages in thread
From: Rich Lane @ 2009-09-30 20:16 UTC (permalink / raw)
Excerpts from William Morgan's message of Wed Sep 30 15:40:32 -0400 2009:
> Reformatted excerpts from Rich Lane's message of 2009-09-13:
> > Refactor index_message so that we do the minimal amount of work based
> > on what state the user has modified.
>
> Branch xapian-message-state, merged into next. Thanks!
>
> BTW I'm excited about the direction this is going. State changes on
> large numbers of messages seem significantly faster with this.
They're about 3 times faster on my machine with this patch. An
optimization the Xapian devs have been planning to make (and that this
patch is necessary to take advantage of) should increase performance
much more.
^ permalink raw reply [flat|nested] 8+ messages in thread
* [sup-talk] [PATCH] xapian: do less work for update_message_state
2009-09-30 20:16 ` Rich Lane
@ 2009-10-01 13:46 ` William Morgan
2009-10-01 17:02 ` Rich Lane
0 siblings, 1 reply; 8+ messages in thread
From: William Morgan @ 2009-10-01 13:46 UTC (permalink / raw)
Reformatted excerpts from Rich Lane's message of 2009-09-30:
> They're about 3 times faster on my machine with this patch. An
> optimization the Xapian devs have been planning to make (and that this
> patch is necessary to take advantage of) should increase performance
> much more.
Awesome. Out of curiosity, what's the optimization?
--
William <wmorgan-sup at masanjin.net>
^ permalink raw reply [flat|nested] 8+ messages in thread
* [sup-talk] [PATCH] xapian: do less work for update_message_state
2009-10-01 13:46 ` William Morgan
@ 2009-10-01 17:02 ` Rich Lane
2009-10-02 7:57 ` Olly Betts
0 siblings, 1 reply; 8+ messages in thread
From: Rich Lane @ 2009-10-01 17:02 UTC (permalink / raw)
Excerpts from William Morgan's message of Thu Oct 01 09:46:20 -0400 2009:
> Reformatted excerpts from Rich Lane's message of 2009-09-30:
> > They're about 3 times faster on my machine with this patch. An
> > optimization the Xapian devs have been planning to make (and that this
> > patch is necessary to take advantage of) should increase performance
> > much more.
>
> Awesome. Out of curiosity, what's the optimization?
replace_document currently deletes all the old postings and inserts new
ones. It can be optimized to make the minimal set of modifications.
^ permalink raw reply [flat|nested] 8+ messages in thread
* [sup-talk] [PATCH] xapian: do less work for update_message_state
2009-10-01 17:02 ` Rich Lane
@ 2009-10-02 7:57 ` Olly Betts
2010-02-04 3:42 ` Olly Betts
0 siblings, 1 reply; 8+ messages in thread
From: Olly Betts @ 2009-10-02 7:57 UTC (permalink / raw)
On 2009-10-01, Rich Lane <rlane at club.cc.cmu.edu> wrote:
> Excerpts from William Morgan's message of Thu Oct 01 09:46:20 -0400 2009:
>> Reformatted excerpts from Rich Lane's message of 2009-09-30:
>> > They're about 3 times faster on my machine with this patch. An
>> > optimization the Xapian devs have been planning to make (and that this
>> > patch is necessary to take advantage of) should increase performance
>> > much more.
>>
>> Awesome. Out of curiosity, what's the optimization?
>
> replace_document currently deletes all the old postings and inserts new
> ones. It can be optimized to make the minimal set of modifications.
This is the ticket for it:
http://trac.xapian.org/ticket/250
Cheers,
Olly
^ permalink raw reply [flat|nested] 8+ messages in thread
* Re: [sup-talk] [PATCH] xapian: do less work for update_message_state
2009-10-02 7:57 ` Olly Betts
@ 2010-02-04 3:42 ` Olly Betts
2010-02-18 2:27 ` Olly Betts
0 siblings, 1 reply; 8+ messages in thread
From: Olly Betts @ 2010-02-04 3:42 UTC (permalink / raw)
To: sup-talk
Olly Betts writes:
> On 2009-10-01, Rich Lane <rlane@club.cc.cmu.edu> wrote:
> > Excerpts from William Morgan's message of Thu Oct 01 09:46:20 -0400 2009:
> >> Reformatted excerpts from Rich Lane's message of 2009-09-30:
> >> > They're about 3 times faster on my machine with this patch. An
> >> > optimization the Xapian devs have been planning to make (and that this
> >> > patch is necessary to take advantage of) should increase performance
> >> > much more.
> >>
> >> Awesome. Out of curiosity, what's the optimization?
> >
> > replace_document currently deletes all the old postings and inserts new
> > ones. It can be optimized to make the minimal set of modifications.
>
> This is the ticket for it:
>
> http://trac.xapian.org/ticket/250
This has now been fixed in Xapian SVN trunk, and I've backported the
patches to Xapian's 1.0 branch. You can find snapshot tarballs including
these changes here:
http://oligarchy.co.uk/xapian/branches/1.0/
Xapian's testsuite passes (including the additional test coverage which I
also backported), and I looked over each change carefully, but I would be
interested to see some real world testing, particularly in the situation
which these changes are intended to improve (i.e. speed of adding/removing
tag terms from existing documents). I'm not expecting problems, but this
is a larger change than we usually backport, so wider testing would be a
good thing.
If you are using Xapian 1.0's default backend (flint), you'll need this
patch (which was merged before sup 0.10) to get full benefit from the
changes:
http://article.gmane.org/gmane.mail.sup.devel/177
I've uploaded a snapshot of 1.0 with this patch to Debian experimental.
It hasn't built for all Debian architectures yet, but is available for at
least amd64 and x86, which are probably the most popular two.
If you aren't sure how to pull in packages from experimental, see:
http://wiki.debian.org/DebianExperimental
I've also put it in a Launchpad PPA for all currently supported Ubuntu
releases, and it has built for all of them already:
https://launchpad.net/~ojwb/+archive/experimental/
Cheers,
Olly
_______________________________________________
sup-talk mailing list
sup-talk@rubyforge.org
http://rubyforge.org/mailman/listinfo/sup-talk
^ permalink raw reply [flat|nested] 8+ messages in thread
* Re: [sup-talk] [PATCH] xapian: do less work for update_message_state
2010-02-04 3:42 ` Olly Betts
@ 2010-02-18 2:27 ` Olly Betts
0 siblings, 0 replies; 8+ messages in thread
From: Olly Betts @ 2010-02-18 2:27 UTC (permalink / raw)
To: sup-talk
On 2010-02-04, Olly Betts <olly@survex.com> wrote:
> Olly Betts writes:
>> On 2009-10-01, Rich Lane <rlane@club.cc.cmu.edu> wrote:
>> > Excerpts from William Morgan's message of Thu Oct 01 09:46:20 -0400 2009:
>> >> Reformatted excerpts from Rich Lane's message of 2009-09-30:
>> >> > They're about 3 times faster on my machine with this patch. An
>> >> > optimization the Xapian devs have been planning to make (and that this
>> >> > patch is necessary to take advantage of) should increase performance
>> >> > much more.
>> >>
>> >> Awesome. Out of curiosity, what's the optimization?
>> >
>> > replace_document currently deletes all the old postings and inserts new
>> > ones. It can be optimized to make the minimal set of modifications.
>>
>> This is the ticket for it:
>>
>> http://trac.xapian.org/ticket/250
This fix went into the recent Xapian 1.0.18 release (and also in 1.1.4, which
is a release candidate for 1.2.0).
Cheers,
Olly
_______________________________________________
sup-talk mailing list
sup-talk@rubyforge.org
http://rubyforge.org/mailman/listinfo/sup-talk
^ permalink raw reply [flat|nested] 8+ messages in thread
end of thread, other threads:[~2010-02-18 2:28 UTC | newest]
Thread overview: 8+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2009-09-13 18:44 [sup-talk] [PATCH] xapian: do less work for update_message_state Rich Lane
2009-09-30 19:40 ` William Morgan
2009-09-30 20:16 ` Rich Lane
2009-10-01 13:46 ` William Morgan
2009-10-01 17:02 ` Rich Lane
2009-10-02 7:57 ` Olly Betts
2010-02-04 3:42 ` Olly Betts
2010-02-18 2:27 ` Olly Betts
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox