email: Fix RFC 2047 header decoding with line folding

srinivasreddy · srinivasreddy · commit a4e1f046bba6 · 2024-12-20T10:48:06.000+05:30
diff --git a/Lib/email/header.py b/Lib/email/header.py
@@ -102,6 +102,8 @@ def decode_header(header):
     for n, w in enumerate(words):
         if n>1 and w[1] and words[n-2][1] and words[n-1][0].isspace():
             droplist.append(n-1)
+            if n < len(words):
+                words[n] = (words[n][0].lstrip(), words[n][1], words[n][2])
     for d in reversed(droplist):
         del words[d]
 
diff --git a/Lib/test/test_email/test_message.py b/Lib/test/test_email/test_message.py
@@ -1055,5 +1055,80 @@ def test_string_payload_with_multipart_content_type(self):
         self.assertEqual(list(attachments), [])
 
 
+class TestHeaderDecoding(unittest.TestCase):
+    """Round-trip non-ASCII headers through as_bytes() and the parser.
+
+    NOTE(review): every test parses with policy.default; confirm that this
+    path actually reaches email.header.decode_header, otherwise the change
+    to Lib/email/header.py is not covered here.
+    """
+
+    def _roundtrip(self, headers):
+        # Build a message from (name, value) pairs, serialize it with
+        # as_bytes(), and parse the bytes back with policy.default.
+        # Returns (original_message, parsed_message).
+        message = EmailMessage()
+        for name, value in headers:
+            message[name] = value
+        parsed = message_from_bytes(message.as_bytes(), policy=policy.default)
+        return message, parsed
+
+    def test_encoded_word_splitting(self):
+        # Accented display name long enough to force line splitting.
+        name = "Bérénice-Amélie Rosemonde Dûbois-Bénard"
+        address = f"{name} <rose@example.com>"
+        _, parsed = self._roundtrip([("From", address)])
+        self.assertEqual(str(parsed["From"].addresses[0]), address)
+        self.assertEqual(parsed["From"].addresses[0].display_name, name)
+
+    def test_multiple_encoded_words(self):
+        # Several headers, each containing more than one non-ASCII word.
+        headers = [
+            ("From", "André von Müller <andre@example.com>"),
+            ("To", "José García López <jose@example.com>"),
+            ("Subject", "Re: études à l'université"),
+        ]
+        _, parsed = self._roundtrip(headers)
+        for header, value in headers:
+            with self.subTest(header=header):
+                self.assertEqual(str(parsed[header]), value)
+
+    def test_long_encoded_words(self):
+        # Very long name that forces multiple encoded-word splits.
+        long_name = "Maximilian-Friedrich von Württemberg-Höchstadt III"
+        address = f"{long_name} <max@example.com>"
+        _, parsed = self._roundtrip([("From", address)])
+        self.assertEqual(str(parsed["From"].addresses[0]), address)
+        self.assertEqual(parsed["From"].addresses[0].display_name, long_name)
+
+    def test_mixed_ascii_and_encoded(self):
+        # ASCII words next to a non-ASCII word.  No parentheses here: in an
+        # address header "(...)" is an RFC 5322 comment, not name text.
+        name = 'ACME Corp アクメ'
+        address = f'{name} <info@example.com>'
+        _, parsed = self._roundtrip([("From", address)])
+        self.assertEqual(str(parsed["From"].addresses[0]), address)
+        self.assertEqual(parsed["From"].addresses[0].display_name, name)
+
+    def test_whitespace_handling(self):
+        # Runs of blanks between words.  Compare against the value the
+        # original message reports rather than the raw input, since header
+        # parsing may normalise whitespace inside a display name.
+        headers = [
+            ("From", "María  José <maria.jose@example.com>"),  # Double space
+            ("To", "André\tvon\tMüller <andre@example.com>"),  # Tabs
+        ]
+        message, parsed = self._roundtrip(headers)
+        for header, _ in headers:
+            with self.subTest(header=header):
+                self.assertEqual(str(parsed[header]), str(message[header]))
+
+    def test_newline_in_header_value_rejected(self):
+        # A raw line break in a header value is refused when it is assigned,
+        # so it cannot take part in a round-trip.
+        message = EmailMessage()
+        with self.assertRaises(ValueError):
+            message["Cc"] = "José\n García <jose@example.com>"
+
 if __name__ == '__main__':
     unittest.main()