sup

A curses threads-with-tags style email client

sup.git

git clone https://supmua.dev/git/sup/
commit b1c9bb0e6792f0a2e7cf0ef347c2f080f1077a5b
parent cd8b84ae6610e6f5b62cd28a4d86c055893d20f6
Author: Dan Callaghan <djc@djc.id.au>
Date:   Sun, 21 Jun 2020 19:22:32 +1000

fix charset handling for quotable attachments

This patch fixes two related issues when displaying text/plain
attachments, or attachments which have been converted to text by the
mime-decode hook.

When the attachment was text/plain and it declared a charset, we were
trying to transcode it back *into* its declared charset which is
backwards and doesn't work. Probably nobody ever noticed this because
text/plain attachments in non-ASCII charsets would be rare.

Instead, in that case we should be using String#force_encoding to tell
the string what encoding it is really in (or supposed to be in) before
we manipulate it any further.

On the other hand, when we get a string back from the mime-decode hook
we *don't* want to apply the original charset to it. There is not
necessarily any relationship between the input bytes and what is
produced by the hook.

For a specific example, Outlook sends messages with
Content-Type: text/html; charset=us-ascii. The mime-decode hook may
run that through elinks, which will always produce output in the user's
locale. If we reinterpret the hook's output as ASCII, any non-ASCII
characters coming from elinks will be ruined.

Diffstat:
M lib/sup/message_chunks.rb | 4 ++--
A test/fixtures/text-attachments-with-charset.eml | 46 ++++++++++++++++++++++++++++++++++++++++++++++
M test/test_message.rb | 28 ++++++++++++++++++++++++++++
3 files changed, 76 insertions(+), 2 deletions(-)
diff --git a/lib/sup/message_chunks.rb b/lib/sup/message_chunks.rb
@@ -128,7 +128,7 @@ EOS
 
       text = case @content_type
       when /^text\/plain\b/
-        @raw_content
+        @raw_content.force_encoding(encoded_content.charset || 'US-ASCII')
       else
         HookManager.run "mime-decode", :content_type => @content_type,
                         :filename => lambda { write_to_disk },
@@ -138,7 +138,7 @@ EOS
 
       @lines = nil
       if text
-        text = text.transcode(encoded_content.charset || $encoding, text.encoding)
+        text = text.encode($encoding, :invalid => :replace, :undef => :replace)
         begin
           @lines = text.gsub("\r\n", "\n").gsub(/\t/, "        ").gsub(/\r/, "").split("\n")
         rescue Encoding::CompatibilityError
diff --git a/test/fixtures/text-attachments-with-charset.eml b/test/fixtures/text-attachments-with-charset.eml
@@ -0,0 +1,46 @@
+From: Fake Sender <fake_sender@example.invalid>
+To: Fake Receiver <fake_receiver@localhost>
+Date: Sun, 21 Jun 2020 06:25:49 -0000
+Subject: Attachments with charset
+MIME-Version: 1.0
+Content-Type: multipart/mixed; boundary="===============2385509127900810307=="
+
+--===============2385509127900810307==
+Content-Type: text/plain; charset="utf-8"
+Content-Transfer-Encoding: 7bit
+
+This is the body.
+
+--===============2385509127900810307==
+Content-Type: text/plain; charset="us-ascii"
+Content-Transfer-Encoding: 7bit
+MIME-Version: 1.0
+Content-Disposition: attachment; filename="ascii.txt"
+
+This is ASCII
+
+--===============2385509127900810307==
+Content-Type: text/plain; charset="koi8-r"
+Content-Transfer-Encoding: quoted-printable
+MIME-Version: 1.0
+Content-Disposition: attachment; filename="cyrillic.txt"
+
+=F0=D2=C9=D7=C5=D4
+
+--===============2385509127900810307==
+Content-Type: text/plain; charset="utf-8"
+Content-Transfer-Encoding: base64
+MIME-Version: 1.0
+Content-Disposition: attachment; filename="emoji.txt"
+
+8J+Yggo=
+
+--===============2385509127900810307==
+Content-Type: text/plain
+Content-Transfer-Encoding: quoted-printable
+Content-Disposition: attachment; filename="bad.txt"
+MIME-Version: 1.0
+
+Embedded=F0garbage
+--===============2385509127900810307==--
+
diff --git a/test/test_message.rb b/test/test_message.rb
@@ -181,6 +181,34 @@ class TestMessage < Minitest::Test
     # TODO: Add more asserts
   end
 
+  def test_text_attachment_decoding
+    message = fixture('text-attachments-with-charset.eml')
+
+    source = DummySource.new("sup-test://test_text_attachment_decoding")
+    source.messages = [ message ]
+    source_info = 0
+
+    sup_message = Message.build_from_source(source, source_info)
+    sup_message.load_from_source!
+
+    chunks = sup_message.load_from_source!
+    assert_equal(5, chunks.length)
+    assert(chunks[0].is_a? Redwood::Chunk::Text)
+    ## The first attachment declares charset=us-ascii
+    assert(chunks[1].is_a? Redwood::Chunk::Attachment)
+    assert_equal(["This is ASCII"], chunks[1].lines)
+    ## The second attachment declares charset=koi8-r and has some Cyrillic
+    assert(chunks[2].is_a? Redwood::Chunk::Attachment)
+    assert_equal(["\u041f\u0440\u0438\u0432\u0435\u0442"], chunks[2].lines)
+    ## The third attachment declares charset=utf-8 and has an emoji
+    assert(chunks[3].is_a? Redwood::Chunk::Attachment)
+    assert_equal(["\u{1f602}"], chunks[3].lines)
+    ## The fourth attachment declares no charset and has a non-ASCII byte,
+    ## which will be replaced with U+FFFD REPLACEMENT CHARACTER
+    assert(chunks[4].is_a? Redwood::Chunk::Attachment)
+    assert_equal(["Embedded\ufffdgarbage"], chunks[4].lines)
+  end
+
   def test_blank_header_lines
     message = fixture('blank-header-fields.eml')