From: rlane@club.cc.cmu.edu (Rich Lane)
Subject: [sup-talk] [PATCH] index: cleanup interface
Date: Tue, 16 Jun 2009 17:24:58 -0700 [thread overview]
Message-ID: <1245198299-18742-1-git-send-email-rlane@club.cc.cmu.edu> (raw)
Added the public methods 'each_docid', 'each_message', and 'optimize' to the
index. Removed the 'index' and 'ferret' accessors and modified their callers to
use the new methods. Bonus fixes: sup-dump no longer skips the first message
and sup-sync --start_at can now delete unseen messages.
---
bin/sup-dump | 6 ++----
bin/sup-sync | 21 +++++++--------------
bin/sup-tweak-labels | 2 +-
lib/sup/index.rb | 27 ++++++++++++++++++++-------
4 files changed, 30 insertions(+), 26 deletions(-)
diff --git a/bin/sup-dump b/bin/sup-dump
index 29f6d6e..9b0892e 100755
--- a/bin/sup-dump
+++ b/bin/sup-dump
@@ -24,8 +24,6 @@ end
index = Redwood::Index.new
index.load
-(1 ... index.index.reader.max_doc).each do |i|
- next if index.index.deleted? i
- d = index.index[i]
- puts [d[:message_id], "(" + d[:label] + ")"] * " "
+index.each_message do |m|
+ puts "#{m.id} (#{m.labels * ' '})"
end
diff --git a/bin/sup-sync b/bin/sup-sync
index 9c342d2..a6e3478 100755
--- a/bin/sup-sync
+++ b/bin/sup-sync
@@ -208,24 +208,17 @@ begin
## delete any messages in the index that claim they're from one of
## these sources, but that we didn't see.
- ##
- ## kinda crappy code here, because we delve directly into the Ferret
- ## API.
- ##
- ## TODO: move this to Index, i suppose.
- if (target == :all || target == :changed) && !opts[:start_at]
+ if (target == :all || target == :changed)
$stderr.puts "Deleting missing messages from the index..."
num_del, num_scanned = 0, 0
sources.each do |source|
raise "no source id for #{source}" unless source.id
- q = "+source_id:#{source.id}"
- q += " +source_info: >= #{opts[:start_at]}" if opts[:start_at]
- index.index.search_each(q, :limit => :all) do |docid, score|
+ index.each_message :source_id => source.id do |m|
num_scanned += 1
- mid = index.index[docid][:message_id]
- unless seen[mid]
- puts "Deleting #{mid}" if opts[:verbose]
- index.index.delete docid unless opts[:dry_run]
+ unless seen[m.id]
+ next unless m.source_info >= opts[:start_at] if opts[:start_at]
+ puts "Deleting #{m.id}" if opts[:verbose]
+ index.drop_entry m.id unless opts[:dry_run]
num_del += 1
end
end
@@ -237,7 +230,7 @@ begin
if opts[:optimize]
$stderr.puts "Optimizing index..."
- optt = time { index.index.optimize unless opts[:dry_run] }
+ optt = time { index.optimize unless opts[:dry_run] }
$stderr.puts "Optimized index of size #{index.size} in #{optt}s."
end
rescue Redwood::FatalSourceError => e
diff --git a/bin/sup-tweak-labels b/bin/sup-tweak-labels
index 538db8b..f526a95 100755
--- a/bin/sup-tweak-labels
+++ b/bin/sup-tweak-labels
@@ -118,7 +118,7 @@ begin
unless num_changed == 0
$stderr.puts "Optimizing index..."
- index.ferret.optimize unless opts[:dry_run]
+ index.optimize unless opts[:dry_run]
end
rescue Exception => e
diff --git a/lib/sup/index.rb b/lib/sup/index.rb
index ca01ee7..037b941 100644
--- a/lib/sup/index.rb
+++ b/lib/sup/index.rb
@@ -24,11 +24,6 @@ class Index
include Singleton
- ## these two accessors should ONLY be used by single-threaded programs.
- ## otherwise you will have a naughty ferret on your hands.
- attr_reader :index
- alias ferret index
-
def initialize dir=BASE_DIR
@index_mutex = Monitor.new
@@ -151,7 +146,7 @@ EOS
if File.exists? dir
Redwood::log "loading index..."
@index_mutex.synchronize do
- @index = Ferret::Index::Index.new(:path => dir, :analyzer => @analyzer)
+ @index = Ferret::Index::Index.new(:path => dir, :analyzer => @analyzer, :id_field => 'message_id')
Redwood::log "loaded index of #{@index.size} messages"
end
else
@@ -171,7 +166,7 @@ EOS
field_infos.add_field :refs
field_infos.add_field :snippet, :index => :no, :term_vector => :no
field_infos.create_index dir
- @index = Ferret::Index::Index.new(:path => dir, :analyzer => @analyzer)
+ @index = Ferret::Index::Index.new(:path => dir, :analyzer => @analyzer, :id_field => 'message_id')
end
end
end
@@ -496,6 +491,22 @@ EOS
results.hits.map { |hit| hit.doc }
end
+ def each_docid opts={}
+ query = build_query opts
+ results = @index_mutex.synchronize { @index.search query, :limit => (opts[:limit] || :all) }
+ results.hits.map { |hit| yield hit.doc }
+ end
+
+ def each_message opts={}
+ each_docid opts do |docid|
+ yield build_message(docid)
+ end
+ end
+
+ def optimize
+ @index_mutex.synchronize { @index.optimize }
+ end
+
protected
class ParseError < StandardError; end
@@ -621,6 +632,8 @@ protected
query.add_query Ferret::Search::TermQuery.new("label", "spam"), :must_not unless opts[:load_spam] || labels.include?(:spam)
query.add_query Ferret::Search::TermQuery.new("label", "deleted"), :must_not unless opts[:load_deleted] || labels.include?(:deleted)
query.add_query Ferret::Search::TermQuery.new("label", "killed"), :must_not if opts[:skip_killed]
+
+ query.add_query Ferret::Search::TermQuery.new("source_id", opts[:source_id]), :must if opts[:source_id]
query
end
--
1.6.0.4
next reply other threads:[~2009-06-17 0:24 UTC|newest]
Thread overview: 2+ messages / expand[flat|nested] mbox.gz Atom feed top
2009-06-17 0:24 Rich Lane [this message]
2009-06-17 0:24 ` [sup-talk] [PATCH] index: consistent naming Rich Lane
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=1245198299-18742-1-git-send-email-rlane@club.cc.cmu.edu \
--to=rlane@club.cc.cmu.edu \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox