[sup-talk] [PATCH] index: cleanup interface

Archive of RubyForge sup-talk mailing list
 help / color / mirror / Atom feed

* [sup-talk] [PATCH] index: cleanup interface
@ 2009-06-17  0:24 Rich Lane
  2009-06-17  0:24 ` [sup-talk] [PATCH] index: consistent naming Rich Lane
  0 siblings, 1 reply; 2+ messages in thread
From: Rich Lane @ 2009-06-17  0:24 UTC (permalink / raw)


Added the public methods 'each_docid', 'each_message', and 'optimize' to the
index. Removed the 'index' and 'ferret' accessors and modified their callers to
use the new methods. Bonus fixes: sup-dump no longer skips the first message,
and sup-sync --start_at can now delete unseen messages.
---
 bin/sup-dump         |    6 ++----
 bin/sup-sync         |   21 +++++++--------------
 bin/sup-tweak-labels |    2 +-
 lib/sup/index.rb     |   27 ++++++++++++++++++++-------
 4 files changed, 30 insertions(+), 26 deletions(-)

diff --git a/bin/sup-dump b/bin/sup-dump
index 29f6d6e..9b0892e 100755
--- a/bin/sup-dump
+++ b/bin/sup-dump
@@ -24,8 +24,6 @@ end
 index = Redwood::Index.new
 index.load
 
-(1 ... index.index.reader.max_doc).each do |i|
-  next if index.index.deleted? i
-  d = index.index[i]
-  puts [d[:message_id], "(" + d[:label] + ")"] * " "
+index.each_message do |m|
+  puts "#{m.id} (#{m.labels * ' '})"
 end
diff --git a/bin/sup-sync b/bin/sup-sync
index 9c342d2..a6e3478 100755
--- a/bin/sup-sync
+++ b/bin/sup-sync
@@ -208,24 +208,17 @@ begin
 
   ## delete any messages in the index that claim they're from one of
   ## these sources, but that we didn't see.
-  ##
-  ## kinda crappy code here, because we delve directly into the Ferret
-  ## API.
-  ##
-  ## TODO: move this to Index, i suppose.
-  if (target == :all || target == :changed) && !opts[:start_at]
+  if (target == :all || target == :changed)
     $stderr.puts "Deleting missing messages from the index..."
     num_del, num_scanned = 0, 0
     sources.each do |source|
       raise "no source id for #{source}" unless source.id
-      q = "+source_id:#{source.id}"
-      q += " +source_info: >= #{opts[:start_at]}" if opts[:start_at]
-      index.index.search_each(q, :limit => :all) do |docid, score|
+      index.each_message :source_id => source.id do |m|
         num_scanned += 1
-        mid = index.index[docid][:message_id]
-        unless seen[mid]
-          puts "Deleting #{mid}" if opts[:verbose]
-          index.index.delete docid unless opts[:dry_run]
+        unless seen[m.id]
+          next unless m.source_info >= opts[:start_at] if opts[:start_at]
+          puts "Deleting #{m.id}" if opts[:verbose]
+          index.drop_entry m.id unless opts[:dry_run]
           num_del += 1
         end
       end
@@ -237,7 +230,7 @@ begin
 
   if opts[:optimize]
     $stderr.puts "Optimizing index..."
-    optt = time { index.index.optimize unless opts[:dry_run] }
+    optt = time { index.optimize unless opts[:dry_run] }
     $stderr.puts "Optimized index of size #{index.size} in #{optt}s."
   end
 rescue Redwood::FatalSourceError => e
diff --git a/bin/sup-tweak-labels b/bin/sup-tweak-labels
index 538db8b..f526a95 100755
--- a/bin/sup-tweak-labels
+++ b/bin/sup-tweak-labels
@@ -118,7 +118,7 @@ begin
 
   unless num_changed == 0
     $stderr.puts "Optimizing index..."
-    index.ferret.optimize unless opts[:dry_run]
+    index.optimize unless opts[:dry_run]
   end
 
 rescue Exception => e
diff --git a/lib/sup/index.rb b/lib/sup/index.rb
index ca01ee7..037b941 100644
--- a/lib/sup/index.rb
+++ b/lib/sup/index.rb
@@ -24,11 +24,6 @@ class Index
 
   include Singleton
 
-  ## these two accessors should ONLY be used by single-threaded programs.
-  ## otherwise you will have a naughty ferret on your hands.
-  attr_reader :index
-  alias ferret index
-
   def initialize dir=BASE_DIR
     @index_mutex = Monitor.new
 
@@ -151,7 +146,7 @@ EOS
     if File.exists? dir
       Redwood::log "loading index..."
       @index_mutex.synchronize do
-        @index = Ferret::Index::Index.new(:path => dir, :analyzer => @analyzer)
+        @index = Ferret::Index::Index.new(:path => dir, :analyzer => @analyzer, :id_field => 'message_id')
         Redwood::log "loaded index of #{@index.size} messages"
       end
     else
@@ -171,7 +166,7 @@ EOS
         field_infos.add_field :refs
         field_infos.add_field :snippet, :index => :no, :term_vector => :no
         field_infos.create_index dir
-        @index = Ferret::Index::Index.new(:path => dir, :analyzer => @analyzer)
+        @index = Ferret::Index::Index.new(:path => dir, :analyzer => @analyzer, :id_field => 'message_id')
       end
     end
   end
@@ -496,6 +491,22 @@ EOS
     results.hits.map { |hit| hit.doc }
   end
 
+  def each_docid opts={}
+    query = build_query opts
+    results = @index_mutex.synchronize { @index.search query, :limit => (opts[:limit] || :all) }
+    results.hits.map { |hit| yield hit.doc }
+  end
+    
+  def each_message opts={}
+    each_docid opts do |docid|
+      yield build_message(docid)
+    end
+  end
+
+  def optimize
+    @index_mutex.synchronize { @index.optimize }
+  end
+
 protected
 
   class ParseError < StandardError; end
@@ -621,6 +632,8 @@ protected
     query.add_query Ferret::Search::TermQuery.new("label", "spam"), :must_not unless opts[:load_spam] || labels.include?(:spam)
     query.add_query Ferret::Search::TermQuery.new("label", "deleted"), :must_not unless opts[:load_deleted] || labels.include?(:deleted)
     query.add_query Ferret::Search::TermQuery.new("label", "killed"), :must_not if opts[:skip_killed]
+
+    query.add_query Ferret::Search::TermQuery.new("source_id", opts[:source_id]), :must if opts[:source_id]
     query
   end
 
-- 
1.6.0.4



^ permalink raw reply	[flat|nested] 2+ messages in thread

* [sup-talk] [PATCH] index: consistent naming
  2009-06-17  0:24 [sup-talk] [PATCH] index: cleanup interface Rich Lane
@ 2009-06-17  0:24 ` Rich Lane
  0 siblings, 0 replies; 2+ messages in thread
From: Rich Lane @ 2009-06-17  0:24 UTC (permalink / raw)


Replace use of 'run_query' with 'each_docid'
Rename instances of ferret query objects to 'ferret_query'
Rename 'build_query' to 'build_ferret_query'
Rename hashes passed to index methods to 'query'
Rename 'parse_user_query_string' to 'parse_query'
Change 'parse_query' to return a query hash
Rename 'drop_entry' to 'delete' and modify callers to pass msgid
---
 bin/sup-sync                         |    2 +-
 bin/sup-tweak-labels                 |    4 +-
 lib/sup/draft.rb                     |    2 +-
 lib/sup/index.rb                     |  108 +++++++++++++++------------------
 lib/sup/modes/search-results-mode.rb |   20 +++----
 5 files changed, 63 insertions(+), 73 deletions(-)

diff --git a/bin/sup-sync b/bin/sup-sync
index a6e3478..a759cbe 100755
--- a/bin/sup-sync
+++ b/bin/sup-sync
@@ -218,7 +218,7 @@ begin
         unless seen[m.id]
           next unless m.source_info >= opts[:start_at] if opts[:start_at]
           puts "Deleting #{m.id}" if opts[:verbose]
-          index.drop_entry m.id unless opts[:dry_run]
+          index.delete m.id unless opts[:dry_run]
           num_del += 1
         end
       end
diff --git a/bin/sup-tweak-labels b/bin/sup-tweak-labels
index f526a95..6f603e2 100755
--- a/bin/sup-tweak-labels
+++ b/bin/sup-tweak-labels
@@ -2,6 +2,7 @@
 
 require 'rubygems'
 require 'trollop'
+require 'enumerator'
 require "sup"
 
 class Float
@@ -81,7 +82,8 @@ begin
   end
   query += ' ' + opts[:query] if opts[:query]
 
-  docs = Redwood::Index.run_query query
+  parsed_query = index.parse_query query
+  docs = Enumerable::Enumerator.new(index, :each_docid, parsed_query).map
   num_total = docs.size
 
   $stderr.puts "Found #{num_total} documents across #{source_ids.length} sources. Scanning..."
diff --git a/lib/sup/draft.rb b/lib/sup/draft.rb
index 32266b5..9127739 100644
--- a/lib/sup/draft.rb
+++ b/lib/sup/draft.rb
@@ -37,7 +37,7 @@ class DraftManager
       return
     end
     raise ArgumentError, "not a draft: source id #{entry[:source_id].inspect}, should be #{DraftManager.source_id.inspect} for #{m.id.inspect} / docno #{docid}" unless entry[:source_id].to_i == DraftManager.source_id
-    Index.drop_entry docid
+    Index.delete m.id
     File.delete @source.fn_for_offset(entry[:source_info])
     UpdateManager.relay self, :single_message_deleted, m
   end
diff --git a/lib/sup/index.rb b/lib/sup/index.rb
index 037b941..d15e7bb 100644
--- a/lib/sup/index.rb
+++ b/lib/sup/index.rb
@@ -279,28 +279,28 @@ EOS
   ## you should probably not call this on a block that doesn't break
   ## rather quickly because the results can be very large.
   EACH_BY_DATE_NUM = 100
-  def each_id_by_date opts={}
+  def each_id_by_date query={}
     return if empty? # otherwise ferret barfs ###TODO: remove this once my ferret patch is accepted
-    query = build_query opts
+    ferret_query = build_ferret_query query
     offset = 0
     while true
-      limit = (opts[:limit])? [EACH_BY_DATE_NUM, opts[:limit] - offset].min : EACH_BY_DATE_NUM
-      results = @index_mutex.synchronize { @index.search query, :sort => "date DESC", :limit => limit, :offset => offset }
-      Redwood::log "got #{results.total_hits} results for query (offset #{offset}) #{query.inspect}"
+      limit = (query[:limit])? [EACH_BY_DATE_NUM, query[:limit] - offset].min : EACH_BY_DATE_NUM
+      results = @index_mutex.synchronize { @index.search ferret_query, :sort => "date DESC", :limit => limit, :offset => offset }
+      Redwood::log "got #{results.total_hits} results for query (offset #{offset}) #{ferret_query.inspect}"
       results.hits.each do |hit|
         yield @index_mutex.synchronize { @index[hit.doc][:message_id] }, lambda { build_message hit.doc }
       end
-      break if opts[:limit] and offset >= opts[:limit] - limit
+      break if query[:limit] and offset >= query[:limit] - limit
       break if offset >= results.total_hits - limit
       offset += limit
     end
   end
 
-  def num_results_for opts={}
+  def num_results_for query={}
     return 0 if empty? # otherwise ferret barfs ###TODO: remove this once my ferret patch is accepted
 
-    q = build_query opts
-    @index_mutex.synchronize { @index.search(q, :limit => 1).total_hits }
+    ferret_query = build_ferret_query query
+    @index_mutex.synchronize { @index.search(ferret_query, :limit => 1).total_hits }
   end
 
   ## yield all messages in the thread containing 'm' by repeatedly
@@ -313,7 +313,7 @@ EOS
   ## is found.
   SAME_SUBJECT_DATE_LIMIT = 7
   MAX_CLAUSES = 1000
-  def each_message_in_thread_for m, opts={}
+  def each_message_in_thread_for m, query={}
     #Redwood::log "Building thread for #{m.id}: #{m.subj}"
     messages = {}
     searched = {}
@@ -332,7 +332,7 @@ EOS
       q.add_query sq, :must
       q.add_query Ferret::Search::RangeQuery.new(:date, :>= => date_min.to_indexable_s, :<= => date_max.to_indexable_s), :must
 
-      q = build_query :qobj => q
+      q = build_ferret_query :qobj => q
 
       p1 = @index_mutex.synchronize { @index.search(q).hits.map { |hit| @index[hit.doc][:message_id] } }
       Redwood::log "found #{p1.size} results for subject query #{q}"
@@ -343,7 +343,7 @@ EOS
       pending = (pending + p1 + p2).uniq
     end
 
-    until pending.empty? || (opts[:limit] && messages.size >= opts[:limit])
+    until pending.empty? || (query[:limit] && messages.size >= query[:limit])
       q = Ferret::Search::BooleanQuery.new true
       # this disappeared in newer ferrets... wtf.
       # q.max_clause_count = 2048
@@ -356,14 +356,14 @@ EOS
       end
       pending = pending[lim .. -1]
 
-      q = build_query :qobj => q
+      q = build_ferret_query :qobj => q
 
       num_queries += 1
       killed = false
       @index_mutex.synchronize do
         @index.search_each(q, :limit => :all) do |docid, score|
-          break if opts[:limit] && messages.size >= opts[:limit]
-          if @index[docid][:label].split(/\s+/).include?("killed") && opts[:skip_killed]
+          break if query[:limit] && messages.size >= query[:limit]
+          if @index[docid][:label].split(/\s+/).include?("killed") && query[:skip_killed]
             killed = true
             break
           end
@@ -419,7 +419,7 @@ EOS
   def wrap_subj subj; "__START_SUBJECT__ #{subj} __END_SUBJECT__"; end
   def unwrap_subj subj; subj =~ /__START_SUBJECT__ (.*?) __END_SUBJECT__/ && $1; end
 
-  def drop_entry docno; @index_mutex.synchronize { @index.delete docno } end
+  def delete id; @index_mutex.synchronize { @index.delete id } end
 
   def load_entry_for_id mid
     @index_mutex.synchronize do
@@ -478,27 +478,14 @@ EOS
     @index_mutex.synchronize { @index.search(q, :limit => 1).total_hits > 0 }
   end
 
-  ## takes a user query string and returns the list of docids for messages
-  ## that match the query.
-  ##
-  ## messages can then be loaded from the index with #build_message.
-  ##
-  ## raises a ParseError if the parsing failed.
-  def run_query query
-    qobj, opts = Redwood::Index.parse_user_query_string query
-    query = Redwood::Index.build_query opts.merge(:qobj => qobj)
-    results = @index.search query, :limit => (opts[:limit] || :all)
-    results.hits.map { |hit| hit.doc }
-  end
-
-  def each_docid opts={}
-    query = build_query opts
-    results = @index_mutex.synchronize { @index.search query, :limit => (opts[:limit] || :all) }
+  def each_docid query={}
+    ferret_query = build_ferret_query query
+    results = @index_mutex.synchronize { @index.search ferret_query, :limit => (query[:limit] || :all) }
     results.hits.map { |hit| yield hit.doc }
   end
     
-  def each_message opts={}
-    each_docid opts do |docid|
+  def each_message query={}
+    each_docid query do |docid|
       yield build_message(docid)
     end
   end
@@ -507,16 +494,15 @@ EOS
     @index_mutex.synchronize { @index.optimize }
   end
 
-protected
-
   class ParseError < StandardError; end
 
-  ## parse a query string from the user. returns a query object and a set of
-  ## extra flags; both of these are meant to be passed to #build_query.
+  ## parse a query string from the user. returns a query object
+  ## that can be passed to any index method with a 'query' 
+  ## argument, as well as build_ferret_query.
   ##
   ## raises a ParseError if something went wrong.
-  def parse_user_query_string s
-    extraopts = {}
+  def parse_query s
+    query = {}
 
     subs = s.gsub(/\b(to|from):(\S+)\b/) do
       field, name = $1, $2
@@ -542,8 +528,8 @@ protected
     ## final stage of query processing. if the user wants to search spam
     ## messages, not adding that is the right thing; if he doesn't want to
     ## search spam messages, then not adding it won't have any effect.
-    extraopts[:load_spam] = true if subs =~ /\blabel:spam\b/
-    extraopts[:load_deleted] = true if subs =~ /\blabel:deleted\b/
+    query[:load_spam] = true if subs =~ /\blabel:spam\b/
+    query[:load_deleted] = true if subs =~ /\blabel:deleted\b/
 
     ## gmail style "is" operator
     subs = subs.gsub(/\b(is|has):(\S+)\b/) do
@@ -552,10 +538,10 @@ protected
       when "read"
         "-label:unread"
       when "spam"
-        extraopts[:load_spam] = true
+        query[:load_spam] = true
         "label:spam"
       when "deleted"
-        extraopts[:load_deleted] = true
+        query[:load_deleted] = true
         "label:deleted"
       else
         "label:#{$2}"
@@ -601,7 +587,7 @@ protected
     subs = subs.gsub(/\blimit:(\S+)\b/) do
       lim = $1
       if lim =~ /^\d+$/
-        extraopts[:limit] = lim.to_i
+        query[:limit] = lim.to_i
         ''
       else
         raise ParseError, "non-numeric limit #{lim.inspect}"
@@ -609,32 +595,36 @@ protected
     end
     
     begin
-      [@qparser.parse(subs), extraopts]
+      query[:qobj] = @qparser.parse(subs)
+      query[:text] = s
+      query
     rescue Ferret::QueryParser::QueryParseException => e
       raise ParseError, e.message
     end
   end
 
-  def build_query opts
-    query = Ferret::Search::BooleanQuery.new
-    query.add_query opts[:qobj], :must if opts[:qobj]
-    labels = ([opts[:label]] + (opts[:labels] || [])).compact
-    labels.each { |t| query.add_query Ferret::Search::TermQuery.new("label", t.to_s), :must }
-    if opts[:participants]
+private
+
+  def build_ferret_query query
+    q = Ferret::Search::BooleanQuery.new
+    q.add_query query[:qobj], :must if query[:qobj]
+    labels = ([query[:label]] + (query[:labels] || [])).compact
+    labels.each { |t| q.add_query Ferret::Search::TermQuery.new("label", t.to_s), :must }
+    if query[:participants]
       q2 = Ferret::Search::BooleanQuery.new
-      opts[:participants].each do |p|
+      query[:participants].each do |p|
         q2.add_query Ferret::Search::TermQuery.new("from", p.email), :should
         q2.add_query Ferret::Search::TermQuery.new("to", p.email), :should
       end
-      query.add_query q2, :must
+      q.add_query q2, :must
     end
         
-    query.add_query Ferret::Search::TermQuery.new("label", "spam"), :must_not unless opts[:load_spam] || labels.include?(:spam)
-    query.add_query Ferret::Search::TermQuery.new("label", "deleted"), :must_not unless opts[:load_deleted] || labels.include?(:deleted)
-    query.add_query Ferret::Search::TermQuery.new("label", "killed"), :must_not if opts[:skip_killed]
+    q.add_query Ferret::Search::TermQuery.new("label", "spam"), :must_not unless query[:load_spam] || labels.include?(:spam)
+    q.add_query Ferret::Search::TermQuery.new("label", "deleted"), :must_not unless query[:load_deleted] || labels.include?(:deleted)
+    q.add_query Ferret::Search::TermQuery.new("label", "killed"), :must_not if query[:skip_killed]
 
-    query.add_query Ferret::Search::TermQuery.new("source_id", opts[:source_id]), :must if opts[:source_id]
-    query
+    q.add_query Ferret::Search::TermQuery.new("source_id", query[:source_id]), :must if query[:source_id]
+    q
   end
 
   def save_sources fn=Redwood::SOURCE_FN
diff --git a/lib/sup/modes/search-results-mode.rb b/lib/sup/modes/search-results-mode.rb
index 227ee9b..121e817 100644
--- a/lib/sup/modes/search-results-mode.rb
+++ b/lib/sup/modes/search-results-mode.rb
@@ -1,11 +1,9 @@
 module Redwood
 
 class SearchResultsMode < ThreadIndexMode
-  def initialize qobj, qopts = nil
-    @qobj = qobj
-    @qopts = qopts
-
-    super [], { :qobj => @qobj }.merge(@qopts)
+  def initialize query
+    @query = query
+    super [], query
   end
 
   register_keymap do |k|
@@ -13,9 +11,9 @@ class SearchResultsMode < ThreadIndexMode
   end
 
   def refine_search
-    query = BufferManager.ask :search, "refine query: ", (@qobj.to_s + " ")
-    return unless query && query !~ /^\s*$/
-    SearchResultsMode.spawn_from_query query
+    text = BufferManager.ask :search, "refine query: ", (@query[:text] + " ")
+    return unless text && text !~ /^\s*$/
+    SearchResultsMode.spawn_from_query text
   end
 
   ## a proper is_relevant? method requires some way of asking ferret
@@ -26,10 +24,10 @@ class SearchResultsMode < ThreadIndexMode
 
   def self.spawn_from_query text
     begin
-      qobj, extraopts = Index.parse_user_query_string(text)
-      return unless qobj
+      query = Index.parse_query(text)
+      return unless query
       short_text = text.length < 20 ? text : text[0 ... 20] + "..."
-      mode = SearchResultsMode.new qobj, extraopts
+      mode = SearchResultsMode.new query
       BufferManager.spawn "search: \"#{short_text}\"", mode
       mode.load_threads :num => mode.buffer.content_height
     rescue Index::ParseError => e
-- 
1.6.0.4



^ permalink raw reply	[flat|nested] 2+ messages in thread

end of thread, other threads:[~2009-06-17  0:24 UTC | newest]

Thread overview: 2+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2009-06-17  0:24 [sup-talk] [PATCH] index: cleanup interface Rich Lane
2009-06-17  0:24 ` [sup-talk] [PATCH] index: consistent naming Rich Lane

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox