handle UTF-7 in RFC2047 words

commit 9b4d42a2c864c61671d6aa6dc3b368e9b37466fe
parent 0f3e15ee8aee0477acebe0b0c221c3693d42e976
Author: Dan Callaghan <djc@djc.id.au>
Date:   Mon, 18 Apr 2022 12:05:31 +1000

handle UTF-7 in RFC2047 words

I received some spam whose Subject header contained an RFC 2047 encoded
word using the UTF-7 charset. Yet another spam filter evasion technique,
presumably.

Decode it for completeness rather than crashing:

    Encoding::ConverterNotFoundError: code converter not found (UTF-7 to UTF-8)

Diffstat:

M	lib/sup/rfc2047.rb	|	6	++++++
M	lib/sup/util.rb	|	16	++++++++++++++++
M	test/fixtures/rfc2047-header-encoding.eml	|	1	+
M	test/test_message.rb	|	3	++-

4 files changed, 25 insertions(+), 1 deletion(-)
diff --git a/lib/sup/rfc2047.rb b/lib/sup/rfc2047.rb
@@ -16,6 +16,8 @@
 #
 # This file is distributed under the same terms as Ruby.
 
+require 'net/imap'
+
 module Rfc2047
   WORD = %r{=\?([!\#$%&'*+-/0-9A-Z\\^\`a-z{|}~]+)\?([BbQq])\?([!->@-~ ]+)\?=} # :nodoc: 'stupid ruby-mode
   WORDSEQ = %r{(#{WORD.source})\s+(?=#{WORD.source})}
@@ -50,6 +52,10 @@ module Rfc2047
         # WORD.
       end
 
+      # Handle UTF-7 specially because Ruby doesn't actually support it as
+      # a normal character encoding.
+      next text.decode_utf7.encode(target) if charset == 'UTF-7'
+
       begin
         text.force_encoding(charset).encode(target)
       rescue ArgumentError, Encoding::InvalidByteSequenceError
diff --git a/lib/sup/util.rb b/lib/sup/util.rb
@@ -375,6 +375,22 @@ class String
     self
   end
 
+  ## Decodes UTF-7 and returns the resulting decoded string as UTF-8.
+  ##
+  ## Ruby doesn't supply a UTF-7 encoding natively. There is
+  ## Net::IMAP::decode_utf7 which only handles the IMAP "modified UTF-7"
+  ## encoding. This implementation is inspired by that one but handles
+  ## standard UTF-7 shift characters and not the IMAP-specific variation.
+  def decode_utf7
+    gsub(/\+([^-]+)?-/) {
+      if $1
+        ($1 + "===").unpack("m")[0].encode(Encoding::UTF_8, Encoding::UTF_16BE)
+      else
+        "+"
+      end
+    }
+  end
+
   def normalize_whitespace
     gsub(/\t/, "    ").gsub(/\r/, "")
   end
diff --git a/test/fixtures/rfc2047-header-encoding.eml b/test/fixtures/rfc2047-header-encoding.eml
@@ -5,6 +5,7 @@ Subject:
  =?US-ASCII?q?Hans Martin Djupvik?= =?ISO-8859-1?q?,_Ingrid_B=F8?=
  =?KOI8-R?b?LCDp0snOwSDzycTP0s/XwQ?=
  =?UTF-16?b?//4sACAASgBlAHMAcABlAHIAIABCAGUAcgBnAA?=
+ =?UTF-7?b?LCBGcmlkYSBFbmcrQVBnLQ?=
  bad: =?UTF16?q?badcharsetname?= =?US-ASCII?b?/w?=
 
 The subject header contains various RFC2047 encoded words.
diff --git a/test/test_message.rb b/test/test_message.rb
@@ -244,7 +244,8 @@ class TestMessage < Minitest::Test
     sup_message = Message.build_from_source(source, source_info)
     sup_message.load_from_source!
 
-    assert_equal("Hans Martin Djupvik, Ingrid Bø, Ирина Сидорова, Jesper Berg " +
+    assert_equal("Hans Martin Djupvik, Ingrid Bø, Ирина Сидорова, " +
+                 "Jesper Berg, Frida Engø " +
                  "bad: =?UTF16?q?badcharsetname?==?US-ASCII?b?/w?=",
                  sup_message.subj)
   end

sup.git