accept UTF-8 header values

commit 283a8fe5a5f6c362452eb89e9e6f007ec51e822c
parent 562627ee9a78c54494c8eb97a4e6e322ade22132
Author: Dan Callaghan <djc@djc.id.au>
Date:   Fri, 22 Apr 2022 17:52:23 +1000

accept UTF-8 header values

Fixes part of #602.

Diffstat:

M	Manifest.txt	|	1	+
M	lib/sup/message.rb	|	7	++++---
M	sup.gemspec	|	1	+
A	test/fixtures/utf8-header.eml	|	17	+++++++++++++++++
M	test/test_message.rb	|	18	+++++++++++++++---

5 files changed, 38 insertions(+), 6 deletions(-)
diff --git a/Manifest.txt b/Manifest.txt
@@ -119,6 +119,7 @@ test/fixtures/non-ascii-header.eml
 test/fixtures/rfc2047-header-encoding.eml
 test/fixtures/simple-message.eml
 test/fixtures/text-attachments-with-charset.eml
+test/fixtures/utf8-header.eml
 test/fixtures/zimbra-quote-with-bottom-post.eml
 test/gnupg_test_home/.gpg-v21-migrated
 test/gnupg_test_home/gpg.conf
diff --git a/lib/sup/message.rb b/lib/sup/message.rb
@@ -1,6 +1,7 @@
 # encoding: UTF-8
 
 require 'time'
+require 'string-scrub' if /^2\.0\./ =~ RUBY_VERSION
 
 module Redwood
 
@@ -71,9 +72,9 @@ class Message
     return unless v
     return v unless v.is_a? String
     return unless v.size < MAX_HEADER_VALUE_SIZE # avoid regex blowup on spam
-    d = v.dup
-    d = d.transcode($encoding, 'ASCII')
-    Rfc2047.decode_to $encoding, d
+    ## Header values should be either 7-bit with RFC2047-encoded words
+    ## or UTF-8 as per RFC6532. Replace any invalid high bytes with U+FFFD.
+    Rfc2047.decode_to $encoding, v.dup.force_encoding(Encoding::UTF_8).scrub
   end
 
   def parse_header encoded_header
diff --git a/sup.gemspec b/sup.gemspec
@@ -60,6 +60,7 @@ SUP: please note that our old mailing lists have been shut down,
   s.add_runtime_dependency "chronic"
   s.add_runtime_dependency "unicode", "~> 0.4.4"
   s.add_runtime_dependency "unicode-display_width"
+  s.add_runtime_dependency "string-scrub" if /^2\.0\./ =~ RUBY_VERSION
 
   s.add_development_dependency "bundler", ">= 1.3", "< 3"
   s.add_development_dependency "rake"
diff --git a/test/fixtures/utf8-header.eml b/test/fixtures/utf8-header.eml
@@ -0,0 +1,17 @@
+Delivered-To: djc@djc.id.au
+Received: from orpheus.librarything.com (orpheus.librarything.com [74.201.105.9])
+    by djc.id.au (Postfix) with ESMTP id A0CCB20AAB99
+    for <djc@djc.id.au>; Sat, 23 Jan 2021 02:52:15 +1000 (AEST)
+Received: by orpheus.librarything.com (Postfix, from userid 0)
+    id 21B172C20F9; Fri, 22 Jan 2021 11:52:08 -0500 (EST)
+To: djc@djc.id.au
+Subject: LibraryThing: State of the Thing — January
+MIME-Version: 1.0
+Content-type: text/html; charset=iso-8859-1
+From: tim@librarything.com
+Reply-To: tim@librarything.com
+Message-Id: <20210122165208.21B172C20F9@orpheus.librarything.com>
+Date: Fri, 22 Jan 2021 11:52:08 -0500 (EST)
+Return-Path: <root@orpheus.librarything.com>
+
+<p>Some stuff</p>
diff --git a/test/test_message.rb b/test/test_message.rb
@@ -254,9 +254,8 @@ class TestMessage < Minitest::Test
   end
 
   def test_nonascii_header
-    ## Headers are supposed to be 7-bit ASCII, with non-ASCII characters encoded
-    ## using RFC2047 header encoding. But spammers sometimes send high bytes in
-    ## the headers. They will be replaced with U+FFFD REPLACEMENT CHARACTER.
+    ## Spammers sometimes send invalid high bytes in the headers.
+    ## They will be replaced with U+FFFD REPLACEMENT CHARACTER.
     source = DummySource.new("sup-test://test_nonascii_header")
     source.messages = [ fixture_path("non-ascii-header.eml") ]
     source_info = 0
@@ -269,6 +268,19 @@ class TestMessage < Minitest::Test
     assert_equal("spam \ufffd spam", sup_message.subj)
   end
 
+  def test_utf8_header
+    ## UTF-8 is allowed in header values according to RFC6532.
+    source = DummySource.new("sup-test://test_utf8_header")
+    source.messages = [ fixture_path("utf8-header.eml") ]
+    source_info = 0
+
+    sup_message = Message.build_from_source(source, source_info)
+    sup_message.load_from_source!
+
+    assert_equal(Encoding::UTF_8, sup_message.subj.encoding)
+    assert_equal("LibraryThing: State of the Thing — January", sup_message.subj)
+  end
+
   def test_nonascii_header_in_nested_message
     source = DummySource.new("sup-test://test_nonascii_header_in_nested_message")
     source.messages = [ fixture_path("non-ascii-header-in-nested-message.eml") ]

sup.git