commit 283a8fe5a5f6c362452eb89e9e6f007ec51e822c
parent 562627ee9a78c54494c8eb97a4e6e322ade22132
Author: Dan Callaghan <djc@djc.id.au>
Date: Fri, 22 Apr 2022 17:52:23 +1000
accept UTF-8 header values
Fixes part of #602.
Diffstat:
5 files changed, 38 insertions(+), 6 deletions(-)
diff --git a/Manifest.txt b/Manifest.txt
@@ -119,6 +119,7 @@ test/fixtures/non-ascii-header.eml
test/fixtures/rfc2047-header-encoding.eml
test/fixtures/simple-message.eml
test/fixtures/text-attachments-with-charset.eml
+test/fixtures/utf8-header.eml
test/fixtures/zimbra-quote-with-bottom-post.eml
test/gnupg_test_home/.gpg-v21-migrated
test/gnupg_test_home/gpg.conf
diff --git a/lib/sup/message.rb b/lib/sup/message.rb
@@ -1,6 +1,7 @@
# encoding: UTF-8
require 'time'
+require 'string-scrub' if /^2\.0\./ =~ RUBY_VERSION
module Redwood
@@ -71,9 +72,9 @@ class Message
return unless v
return v unless v.is_a? String
return unless v.size < MAX_HEADER_VALUE_SIZE # avoid regex blowup on spam
- d = v.dup
- d = d.transcode($encoding, 'ASCII')
- Rfc2047.decode_to $encoding, d
+ ## Header values should be either 7-bit with RFC2047-encoded words
+ ## or UTF-8 as per RFC6532. Replace any invalid high bytes with U+FFFD.
+ Rfc2047.decode_to $encoding, v.dup.force_encoding(Encoding::UTF_8).scrub
end
def parse_header encoded_header
diff --git a/sup.gemspec b/sup.gemspec
@@ -60,6 +60,7 @@ SUP: please note that our old mailing lists have been shut down,
s.add_runtime_dependency "chronic"
s.add_runtime_dependency "unicode", "~> 0.4.4"
s.add_runtime_dependency "unicode-display_width"
+ s.add_runtime_dependency "string-scrub" if /^2\.0\./ =~ RUBY_VERSION
s.add_development_dependency "bundler", ">= 1.3", "< 3"
s.add_development_dependency "rake"
diff --git a/test/fixtures/utf8-header.eml b/test/fixtures/utf8-header.eml
@@ -0,0 +1,17 @@
+Delivered-To: djc@djc.id.au
+Received: from orpheus.librarything.com (orpheus.librarything.com [74.201.105.9])
+ by djc.id.au (Postfix) with ESMTP id A0CCB20AAB99
+ for <djc@djc.id.au>; Sat, 23 Jan 2021 02:52:15 +1000 (AEST)
+Received: by orpheus.librarything.com (Postfix, from userid 0)
+ id 21B172C20F9; Fri, 22 Jan 2021 11:52:08 -0500 (EST)
+To: djc@djc.id.au
+Subject: LibraryThing: State of the Thing — January
+MIME-Version: 1.0
+Content-type: text/html; charset=iso-8859-1
+From: tim@librarything.com
+Reply-To: tim@librarything.com
+Message-Id: <20210122165208.21B172C20F9@orpheus.librarything.com>
+Date: Fri, 22 Jan 2021 11:52:08 -0500 (EST)
+Return-Path: <root@orpheus.librarything.com>
+
+<p>Some stuff</p>
diff --git a/test/test_message.rb b/test/test_message.rb
@@ -254,9 +254,8 @@ class TestMessage < Minitest::Test
end
def test_nonascii_header
- ## Headers are supposed to be 7-bit ASCII, with non-ASCII characters encoded
- ## using RFC2047 header encoding. But spammers sometimes send high bytes in
- ## the headers. They will be replaced with U+FFFD REPLACEMENT CHARACTER.
+ ## Spammers sometimes send invalid high bytes in the headers.
+ ## They will be replaced with U+FFFD REPLACEMENT CHARACTER.
source = DummySource.new("sup-test://test_nonascii_header")
source.messages = [ fixture_path("non-ascii-header.eml") ]
source_info = 0
@@ -269,6 +268,19 @@ class TestMessage < Minitest::Test
assert_equal("spam \ufffd spam", sup_message.subj)
end
+ def test_utf8_header
+ ## UTF-8 is allowed in header values according to RFC6532.
+ source = DummySource.new("sup-test://test_utf8_header")
+ source.messages = [ fixture_path("utf8-header.eml") ]
+ source_info = 0
+
+ sup_message = Message.build_from_source(source, source_info)
+ sup_message.load_from_source!
+
+ assert_equal(Encoding::UTF_8, sup_message.subj.encoding)
+ assert_equal("LibraryThing: State of the Thing — January", sup_message.subj)
+ end
+
def test_nonascii_header_in_nested_message
source = DummySource.new("sup-test://test_nonascii_header_in_nested_message")
source.messages = [ fixture_path("non-ascii-header-in-nested-message.eml") ]