class MessageIDList(TokenList):
    """Token list produced by parse_message_id_list().

    Its children are MessageID tokens for the msg-ids that parsed and
    InvalidMessageID tokens for the text that did not.
    """
    token_type = "message-id-list"


def get_invalid_msg_id(value, endchars):
    """Collect text that is not a valid msg-id, stopping at *endchars*.

    Consume *value* until it is exhausted or its next character is one of
    *endchars*.  A character in PHRASE_ENDS is stored on its own as a
    'misplaced-special' terminal; anything else is handed to get_phrase().

    Return an ``(InvalidMessageID, remaining_value)`` tuple.  When *value*
    is empty or already starts with a character in *endchars*, nothing is
    consumed and the returned token is empty, so a caller that loops on the
    remaining value must make sure it advances some other way.
    """
    invalid_msg_id = InvalidMessageID()
    while value and value[0] not in endchars:
        if value[0] in PHRASE_ENDS:
            invalid_msg_id.append(ValueTerminal(value[0],
                                                'misplaced-special'))
            value = value[1:]
        else:
            token, value = get_phrase(value)
            invalid_msg_id.append(token)
    return invalid_msg_id, value


def parse_message_id_list(value):
    """Parse the value of an In-Reply-To or References header.

        in-reply-to = "In-Reply-To:" 1*msg-id CRLF
        references  = "References:" 1*msg-id CRLF

    Return a MessageIDList.  Every msg-id accepted by get_msg_id() is
    wrapped in a MessageID token.  Text that get_msg_id() rejects is kept
    as an InvalidMessageID token and reported with an InvalidHeaderDefect
    on the list.  An empty value (after leading CFWS has been removed) is
    reported with an InvalidHeaderDefect as well.  No exception is raised
    for malformed input.
    """
    message_id_list = MessageIDList()

    # Leading CFWS is discarded; it does not appear in the parse tree.
    if value and value[0] in CFWS_LEADER:
        _, value = get_cfws(value)

    # At least one msg-id is required.
    if not value:
        message_id_list.defects.append(errors.InvalidHeaderDefect(
            "Empty message-id-list"))
        return message_id_list

    while value:
        try:
            token, value = get_msg_id(value)
        except errors.HeaderParseError:
            # get_msg_id() failed before rebinding `value`, so it is still
            # the non-empty text we started this iteration with.
            #
            # If that text opens with '<' (for example "<1@>"), scanning
            # "up to the next '<'" would stop at once and consume nothing,
            # and this loop would never end.  Take the '<' ourselves and
            # record it as a misplaced special so every pass advances.
            starts_with_bracket = value[0] == '<'
            if starts_with_bracket:
                value = value[1:]
            token, value = get_invalid_msg_id(value, "<")
            if starts_with_bracket:
                token.insert(0, ValueTerminal('<', 'misplaced-special'))
            message_id_list.append(token)
            message_id_list.defects.append(
                errors.InvalidHeaderDefect(
                    "Invalid msg-id: {!r}".format(str(token))))
        else:
            message_id_list.append(MessageID([token]))

    return message_id_list

#
# XXX: As I begin to add additional header parsers, I'm realizing we probably
# have two level of parser
routines: the get_XXX methods that get a token in diff --git a/Lib/email/headerregistry.py b/Lib/email/headerregistry.py index 543141dc427ebe..b4a845cb873f02 100644 --- a/Lib/email/headerregistry.py +++ b/Lib/email/headerregistry.py @@ -534,6 +534,18 @@ def parse(cls, value, kwds): kwds['defects'].extend(parse_tree.all_defects) +class MessageIDListHeader: + + max_count = None + value_parser = staticmethod(parser.parse_message_id_list) + + @classmethod + def parse(cls, value, kwds): + kwds['parse_tree'] = parse_tree = cls.value_parser(value) + kwds['decoded'] = str(parse_tree) + kwds['defects'].extend(parse_tree.all_defects) + + # The header factory # _default_header_map = { @@ -557,6 +569,8 @@ def parse(cls, value, kwds): 'content-disposition': ContentDispositionHeader, 'content-transfer-encoding': ContentTransferEncodingHeader, 'message-id': MessageIDHeader, + 'references': MessageIDListHeader, + 'in-reply-to': MessageIDListHeader, } class HeaderRegistry: diff --git a/Lib/test/test_email/test__header_value_parser.py b/Lib/test/test_email/test__header_value_parser.py index 95224e19f67ce5..b297054dc4ddbb 100644 --- a/Lib/test/test_email/test__header_value_parser.py +++ b/Lib/test/test_email/test__header_value_parser.py @@ -2789,6 +2789,96 @@ def test_get_msg_id_ws_only_local(self): ) self.assertEqual(msg_id.token_type, 'msg-id') + def test_parse_message_id_list_with_one_id(self): + text = "<1@example.com>" + msg_id_list = self._test_parse_x( + parser.parse_message_id_list, + text, + text, + text, + [], + ) + self.assertEqual(msg_id_list.token_type, 'message-id-list') + + def test_parse_message_id_list(self): + text = "<1@example.com> <2@example.com> <3@example.com>" + self._test_parse_x( + parser.parse_message_id_list, + text, + text, + text, + [], + ) + + def test_parse_message_id_list_extra_white_spaces(self): + text = "<1@example.com> <2@example.com> <3@example.com>" + self._test_parse_x( + parser.parse_message_id_list, + text, + text, + "<1@example.com> 
<2@example.com> <3@example.com>", + [], + ) + + def test_parse_message_id_list_with_invalid_msg_id(self): + text = "<1@example.com> <2@example.com> abc <3@example.com>" + self._test_parse_x( + parser.parse_message_id_list, + text, + text, + text, + [errors.InvalidHeaderDefect], # "Invalid msg-id: 'abc '" + ) + + def test_parse_message_id_list_endswith_invalid_msg_id(self): + text = "<1@example.com> <2@example.com> abc" + self._test_parse_x( + parser.parse_message_id_list, + text, + text, + text, + [errors.InvalidHeaderDefect], # "Invalid msg-id: 'abc '" + ) + + def test_parse_message_id_list_with_no_value(self): + text = "" + self._test_parse_x( + parser.parse_message_id_list, + text, + text, + text, + [errors.InvalidHeaderDefect], # "Empty message-id-list" + ) + + def test_parse_message_id_list_with_invalid_id_only(self): + text = "abc" + self._test_parse_x( + parser.parse_message_id_list, + text, + text, + text, + [errors.InvalidHeaderDefect], # "Invalid msg-id: 'abc '" + ) + + def test_parse_message_id_list_startswith_invalid_id(self): + text = "abc <1@example.com> <2@example.com> abc" + self._test_parse_x( + parser.parse_message_id_list, + text, + text, + text, + [errors.InvalidHeaderDefect, errors.InvalidHeaderDefect], # "Invalid msg-id: 'abc '" + ) + + def test_parse_message_id_list_with_leading_whitespace(self): + text = " <1@example.com> <2@example.com>" + self._test_parse_x( + parser.parse_message_id_list, + text, + text.strip(), + text.strip(), + [], + ) @parameterize diff --git a/Lib/test/test_email/test_message.py b/Lib/test/test_email/test_message.py index 96979db27f3a21..969a8e3000bf2c 100644 --- a/Lib/test/test_email/test_message.py +++ b/Lib/test/test_email/test_message.py @@ -1031,6 +1031,81 @@ def test_get_body_malformed(self): # AttributeError: 'str' object has no attribute 'is_attachment' m.get_body() + def test_long_references_header(self): + msg = textwrap.dedent("""\ + Message-ID: + References: + From: Foo Bar + + No content + """) + m = 
self._str_msg(msg) + msg_bytes = (b'Message-ID:' + b' \n' + b'References:' + b' \n' + b'From: Foo Bar \n\nNo content\n') + self.assertEqual(m.as_bytes(), msg_bytes) + + def test_long_in_reply_to_header(self): + msg = textwrap.dedent("""\ + Message-ID: + In-Reply-To: + From: Foo Bar + + No content + """) + m = self._str_msg(msg) + msg_bytes = (b'Message-ID:' + b' \n' + b'In-Reply-To:' + b' \n' + b'From: Foo Bar \n\nNo content\n') + self.assertEqual(m.as_bytes(), msg_bytes) + + def test_msg_id_list_in_header(self): + msg_ids = " ".join([""] * 5) + msg = textwrap.dedent(f"""\ + Message-ID: + In-Reply-To: {msg_ids} + References: {msg_ids} + From: Foo Bar + + No content + """) + m = self._str_msg(msg) + msg_bytes = (b'Message-ID:' + b' \n' + b'In-Reply-To:' + b' \n' + b' \n' + b' \n' + b' \n' + b' \n' + b'References:' + b' \n' + b' \n' + b' \n' + b' \n' + b' \n' + b'From: Foo Bar \n\nNo content\n') + self.assertEqual(m.as_bytes(), msg_bytes) + + def test_no_references_value(self): + msg = textwrap.dedent("""\ + Message-ID: + References: + From: Foo Bar + + No content + """) + m = self._str_msg(msg) + msg_bytes = (b'Message-ID:' + b' \n' + b'References: \n' + b'From: Foo Bar \n\nNo content\n') + self.assertEqual(m.as_bytes(), msg_bytes) + class TestMIMEPart(TestEmailMessageBase, TestEmailBase): # Doing the full test run here may seem a bit redundant, since the two diff --git a/Misc/NEWS.d/next/Library/2025-01-08-13-16-37.gh-issue-100911.IzrEkV.rst b/Misc/NEWS.d/next/Library/2025-01-08-13-16-37.gh-issue-100911.IzrEkV.rst new file mode 100644 index 00000000000000..c05129da4f9cb1 --- /dev/null +++ b/Misc/NEWS.d/next/Library/2025-01-08-13-16-37.gh-issue-100911.IzrEkV.rst @@ -0,0 +1,2 @@ +Fixed email headers ``References`` and ``In-Reply-To`` being treated as +unstructured.