fix: remove redundant WSP when parsing email

RanKKI · RanKKI · commit c02bc473f19d · 2024-10-24T15:52:30.000+11:00
diff --git a/Lib/email/policy.py b/Lib/email/policy.py
@@ -4,11 +4,11 @@
 
 import re
 import sys
-from email._policybase import Policy, Compat32, compat32, _extend_docstrings
-from email.utils import _has_surrogates
-from email.headerregistry import HeaderRegistry as HeaderRegistry
+from email._policybase import Compat32, Policy, _extend_docstrings, compat32
 from email.contentmanager import raw_data_manager
+from email.headerregistry import HeaderRegistry as HeaderRegistry
 from email.message import EmailMessage
+from email.utils import _has_surrogates
 
 __all__ = [
     'Compat32',
@@ -125,7 +125,27 @@ def header_source_parse(self, sourcelines):
 
         """
         name, value = sourcelines[0].split(':', 1)
-        value = value.lstrip(' \t') + ''.join(sourcelines[1:])
+
+        # Fixed: https://github.com/python/cpython/issues/124452
+        #
+        # Root cause: The function '_refold_parse_tree' in '_header_value_parse.py'.
+        # If there is no WSP, it can't figure out how to wrap the text.
+        # Therefore, it places the entire value directly after '\n', and because
+        # there is a WSP after '<HeaderName>:', the WSP will be moved to the front
+        # of the value according to RFC5322, section 2.2.3.
+        #
+        # However, that WSP is not part of the value; therefore, we must
+        # remove it when parsing.
+
+        # Remove the leading WSP of the first continuation line only if the
+        # first line has no value and continuation lines follow it.
+        remove_wsp = not value.strip() and len(sourcelines) > 1
+
+        value = value.lstrip(' \t')
+        if remove_wsp and sourcelines[1][0] in ' \t':
+            sourcelines[1] = sourcelines[1][1:]
+
+        value += ''.join(sourcelines[1:])
         return (name, value.rstrip('\r\n'))
 
     def header_store_parse(self, name, value):
diff --git a/Lib/test/test_email/test_message.py b/Lib/test/test_email/test_message.py
@@ -1,6 +1,6 @@
-import unittest
 import textwrap
-from email import policy, message_from_string
+import unittest
+from email import message_from_bytes, message_from_string, policy
 from email.message import EmailMessage, MIMEPart
 from test.test_email import TestEmailBase, parameterize
 
@@ -957,6 +957,35 @@ def test_folding_with_utf8_encoding_8(self):
                          b'123456789 123456789 123456789 123456789 '
                          b'123456789-123456789\n 123456789 Hello '
                          b'=?utf-8?q?W=C3=B6rld!?= 123456789 123456789\n\n')
+        
+
+    def test_folding_with_short_nospace_1(self):
+        # See https://github.com/python/cpython/issues/124452
+        #
+        # A short value containing no whitespace must round-trip
+        # through as_bytes() and message_from_bytes() unchanged.
+
+        m = EmailMessage(policy.default)
+        m['Message-ID'] = '12345678912345678123456789123456789123456789'
+        parsed_msg = message_from_bytes(m.as_bytes(), policy=policy.default)
+        self.assertEqual(parsed_msg['Message-ID'], m['Message-ID'])
+    
+    def test_folding_with_long_nospace_1(self):
+        # Fixed: https://github.com/python/cpython/issues/124452
+        # 
+        # A value too long for the header line is folded onto the next
+        # line; parsing it back must yield the original value unmodified.
+
+        m = EmailMessage(policy.default)
+        m['Message-ID'] = '12345678912345678123456789123456789123456789'\
+                        '12345678912345678123456789123456789123456789'
+        self.assertEqual(m.as_bytes(),
+                         b'Message-ID:\n 12345678912345678123456789123456'\
+                         b'78912345678912345678912345678123456789123456789'\
+                         b'123456789\n\n')
+        parsed_msg = message_from_bytes(m.as_bytes(), policy=policy.default)
+        self.assertEqual(parsed_msg['Message-ID'], m['Message-ID'])
+
 
     def test_get_body_malformed(self):
         """test for bpo-42892"""