From 6595af0b55d52d1f68562fbdd0f1b23dfee34039 Mon Sep 17 00:00:00 2001 From: Horacio Sanson Date: Wed, 4 May 2011 10:34:48 +0900 Subject: [PATCH 2/2] Add MeCab Japanese text analyzer. Japanese text has no whitespace separation between words, causing the Whistlepig tokenizer to fail. This patch processes the email indexable text and search queries with MeCab before passing them to Whistlepig. --- bin/heliotrope-server | 3 ++- lib/heliotrope/message.rb | 5 +++-- 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/bin/heliotrope-server b/bin/heliotrope-server index ed9c3be..f3bd5d4 100644 --- a/bin/heliotrope-server +++ b/bin/heliotrope-server @@ -67,6 +67,7 @@ class HeliotropeServer < Sinatra::Base end.to_json end + require "MeCab" def get_query_from_params ## work around a rack (?) bug where quotes are omitted in queries like "hello bob" query = if env["rack.request.query_string"] =~ /\bq=(.+?)(&|$)/ @@ -76,7 +77,7 @@ class HeliotropeServer < Sinatra::Base end raise RequestError, "need a query" unless query - query + MeCab::Tagger.new("-Owakati").parse(query).force_encoding("UTF-8") end def get_search_results diff --git a/lib/heliotrope/message.rb b/lib/heliotrope/message.rb index b48329b..e61d8bd 100644 --- a/lib/heliotrope/message.rb +++ b/lib/heliotrope/message.rb @@ -76,6 +76,7 @@ class Message def indirect_recipients; cc + bcc end def recipients; direct_recipients + indirect_recipients end + require "MeCab" def indexable_text @indexable_text ||= begin v = ([from.indexable_text] + @@ -90,8 +91,8 @@ class Message end ).flatten.compact.join(" ") - v.gsub(/\s+[\W\d_]+(\s|$)/, " "). # drop funny tokens - gsub(/\s+/, " ") + MeCab::Tagger.new("-Owakati").parse(v) # Tokenize Japanese Text + .gsub(/\s+/, " ") end end -- 1.7.4.1