commit 9b4d42a2c864c61671d6aa6dc3b368e9b37466fe
parent 0f3e15ee8aee0477acebe0b0c221c3693d42e976
Author: Dan Callaghan <djc@djc.id.au>
Date: Mon, 18 Apr 2022 12:05:31 +1000
handle UTF-7 in RFC2047 words
I received some spam with UTF-7 RFC2047 in the Subject header. Yet
another spam filter evasion technique, presumably.
Decode it for completeness rather than crashing:
Encoding::ConverterNotFoundError: code converter not found (UTF-7 to UTF-8)
Diffstat:
4 files changed, 25 insertions(+), 1 deletion(-)
diff --git a/lib/sup/rfc2047.rb b/lib/sup/rfc2047.rb
@@ -16,6 +16,8 @@
#
# This file is distributed under the same terms as Ruby.
+require 'net/imap'
+
module Rfc2047
WORD = %r{=\?([!\#$%&'*+-/0-9A-Z\\^\`a-z{|}~]+)\?([BbQq])\?([!->@-~ ]+)\?=} # :nodoc: 'stupid ruby-mode
WORDSEQ = %r{(#{WORD.source})\s+(?=#{WORD.source})}
@@ -50,6 +52,10 @@ module Rfc2047
# WORD.
end
+ # Handle UTF-7 specially because Ruby doesn't actually support it as
+ # a normal character encoding.
+ next text.decode_utf7.encode(target) if charset == 'UTF-7'
+
begin
text.force_encoding(charset).encode(target)
rescue ArgumentError, Encoding::InvalidByteSequenceError
diff --git a/lib/sup/util.rb b/lib/sup/util.rb
@@ -375,6 +375,22 @@ class String
self
end
## Decodes UTF-7 (RFC 2152) and returns the decoded string. Decoded
## non-ASCII characters are inserted as UTF-8.
##
## Ruby doesn't supply a UTF-7 encoding natively. There is
## Net::IMAP::decode_utf7 which only handles the IMAP "modified UTF-7"
## encoding. This implementation is inspired by that one but handles
## standard UTF-7 shift characters and not the IMAP-specific variation.
##
## A shifted run starts at "+" and consists of base64 characters only. It
## ends at the first character outside the base64 alphabet; a terminating
## "-" is consumed, any other terminator is kept. "+-" is a literal "+".
##
## Ill-formed input is decoded leniently rather than raising: a "+" that
## does not introduce at least one complete UTF-16 code unit is left
## untouched, leftover bits at the end of a run are dropped, and invalid
## UTF-16 (e.g. an unpaired surrogate) becomes a replacement character.
def decode_utf7
  gsub(%r{\+([A-Za-z0-9+/]*)-?}) do
    matched = Regexp.last_match(0)
    b64 = Regexp.last_match(1)
    # "+-" escapes a literal plus; a bare "+" followed by neither base64
    # nor "-" is ill-formed and stays a plus as well.
    next "+" if b64.empty?

    # The extra "=" padding lets the lenient base64 decoder accept runs
    # whose length is not a multiple of four.
    utf16 = (b64 + "===").unpack("m")[0]
    # Bits that do not fill a whole 16-bit code unit are discarded.
    utf16 = utf16.byteslice(0, utf16.bytesize - utf16.bytesize % 2)
    # Nothing decodable: this was not really a shift sequence.
    next matched if utf16.empty?

    utf16.encode(Encoding::UTF_8, Encoding::UTF_16BE, invalid: :replace)
  end
end
+
## Returns a copy of the string in which every tab has been replaced by
## " " and every carriage return has been removed.
def normalize_whitespace
  tabs_replaced = gsub(/\t/, " ")
  tabs_replaced.gsub(/\r/, "")
end
diff --git a/test/fixtures/rfc2047-header-encoding.eml b/test/fixtures/rfc2047-header-encoding.eml
@@ -5,6 +5,7 @@ Subject:
=?US-ASCII?q?Hans Martin Djupvik?= =?ISO-8859-1?q?,_Ingrid_B=F8?=
=?KOI8-R?b?LCDp0snOwSDzycTP0s/XwQ?=
=?UTF-16?b?//4sACAASgBlAHMAcABlAHIAIABCAGUAcgBnAA?=
+ =?UTF-7?b?LCBGcmlkYSBFbmcrQVBnLQ?=
bad: =?UTF16?q?badcharsetname?= =?US-ASCII?b?/w?=
The subject header contains various RFC2047 encoded words.
diff --git a/test/test_message.rb b/test/test_message.rb
@@ -244,7 +244,8 @@ class TestMessage < Minitest::Test
sup_message = Message.build_from_source(source, source_info)
sup_message.load_from_source!
- assert_equal("Hans Martin Djupvik, Ingrid Bø, Ирина Сидорова, Jesper Berg " +
+ assert_equal("Hans Martin Djupvik, Ingrid Bø, Ирина Сидорова, " +
+ "Jesper Berg, Frida Engø " +
"bad: =?UTF16?q?badcharsetname?==?US-ASCII?b?/w?=",
sup_message.subj)
end