Skip to content

Commit ef61984

Browse files
committed
gh-128110: Fix rfc2047 handling in email parser address headers
RFC 2047 Section 6.2 requires that "any 'linear-white-space' that separates a pair of adjacent 'encoded-word's is ignored." The modern header value parser correctly implements that for unstructured headers, but had missed a case in structured headers. This could cause a parsed address header to include extraneous spaces in a display-name. Fixed in get_atom() by converting a trailing CFWSList token after an encoded-word to an EWWhiteSpaceTerminal if another encoded-word follows. Deliberately left similar code in get_dotatom() unmodified. A dotatom can only appear within an addr-spec. RFC 2047 Section 5 prohibits use of an encoded-word in any portion of an addr-spec, so its appearance in a dotatom is invalid. Adding (and testing) special white-space handling in an invalid dotatom seems an unnecessary complication.
1 parent c9932a9 commit ef61984

File tree

3 files changed

+106
-0
lines changed

3 files changed

+106
-0
lines changed

Lib/email/_header_value_parser.py

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1340,6 +1340,18 @@ def get_atom(value):
13401340
atom.append(token)
13411341
if value and value[0] in CFWS_LEADER:
13421342
token, value = get_cfws(value)
1343+
# Peek ahead to ignore linear-white-space between adjacent encoded-words.
1344+
if (
1345+
atom[-1].token_type == 'encoded-word'
1346+
and value.startswith('=?')
1347+
and all(ws.token_type == 'fws' for ws in token) # not comments
1348+
):
1349+
try:
1350+
get_encoded_word(value)
1351+
except errors.HeaderParseError:
1352+
pass
1353+
else:
1354+
token = EWWhiteSpaceTerminal(token, 'fws')
13431355
atom.append(token)
13441356
return atom, value
13451357

Lib/test/test_email/test__header_value_parser.py

Lines changed: 89 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1039,6 +1039,79 @@ def get_phrase_cfws_only_raises(self):
10391039
with self.assertRaises(errors.HeaderParseError):
10401040
parser.get_phrase(' (foo) ')
10411041

1042+
def test_get_phrase_adjacent_ew(self):
1043+
# In structured headers, the requirement to ignore linear-white-space
1044+
# between adjacent encoded-words is actually implemented by get_atom.
1045+
# But it's easier to see the results by testing get_phrase.
1046+
self._test_get_x(parser.get_phrase, '=?ascii?q?Joi?= \t =?ascii?q?ned?=', 'Joined', 'Joined', [], '')
1047+
1048+
def test_get_phrase_adjacent_ew_different_encodings(self):
1049+
self._test_get_x(
1050+
parser.get_phrase,
1051+
'=?utf-8?q?B=C3=A9r?= =?iso-8859-1?q?=E9nice?=', 'Bérénice', 'Bérénice', [], ''
1052+
)
1053+
1054+
def test_get_phrase_adjacent_ew_encoded_spaces(self):
1055+
self._test_get_x(
1056+
parser.get_phrase,
1057+
'=?ascii?q?Encoded?= =?ascii?q?_spaces_?= =?ascii?q?preserved?=',
1058+
'Encoded spaces preserved',
1059+
'Encoded spaces preserved',
1060+
[],
1061+
''
1062+
)
1063+
1064+
def test_get_phrase_adjacent_ew_comment_is_not_linear_white_space(self):
1065+
self._test_get_x(
1066+
parser.get_phrase,
1067+
'=?ascii?q?Comment?= (is not) =?ascii?q?linear-white-space?=',
1068+
'Comment (is not) linear-white-space',
1069+
'Comment linear-white-space',
1070+
[],
1071+
'',
1072+
comments=['is not'],
1073+
)
1074+
1075+
def test_get_phrase_adjacent_ew_no_error_on_defects(self):
1076+
self._test_get_x(
1077+
parser.get_phrase,
1078+
'=?ascii?q?Def?= =?ascii?q?ect still joins?=',
1079+
'Defect still joins',
1080+
'Defect still joins',
1081+
[errors.InvalidHeaderDefect], # whitespace inside encoded word
1082+
''
1083+
)
1084+
1085+
def test_get_phrase_adjacent_ew_ignore_non_ew(self):
1086+
self._test_get_x(
1087+
parser.get_phrase,
1088+
'=?ascii?q?No?= =?join?= for non-ew',
1089+
'No =?join?= for non-ew',
1090+
'No =?join?= for non-ew',
1091+
[],
1092+
''
1093+
)
1094+
1095+
def test_get_phrase_adjacent_ew_ignore_invalid_ew(self):
1096+
self._test_get_x(
1097+
parser.get_phrase,
1098+
'=?ascii?q?No?= =?ascii?rot13?wbva= for invalid ew',
1099+
'No =?ascii?rot13?wbva= for invalid ew',
1100+
'No =?ascii?rot13?wbva= for invalid ew',
1101+
[],
1102+
''
1103+
)
1104+
1105+
def test_get_phrase_adjacent_ew_missing_space(self):
1106+
self._test_get_x(
1107+
parser.get_phrase,
1108+
'=?ascii?q?Joi?==?ascii?q?ned?=',
1109+
'Joined',
1110+
'Joined',
1111+
[errors.InvalidHeaderDefect], # missing trailing whitespace
1112+
''
1113+
)
1114+
10421115
# get_local_part
10431116

10441117
def test_get_local_part_simple(self):
@@ -2365,6 +2438,22 @@ def test_get_address_rfc2047_display_name(self):
23652438
self.assertEqual(address[0].token_type,
23662439
'mailbox')
23672440

2441+
def test_get_address_rfc2047_display_name_adjacent_ews(self):
2442+
address = self._test_get_x(parser.get_address,
2443+
'=?utf-8?q?B=C3=A9r?= =?utf-8?q?=C3=A9nice?= <foo@example.com>',
2443+
'Bérénice <foo@example.com>',
2444+
'Bérénice <foo@example.com>',
2446+
[],
2447+
'')
2448+
self.assertEqual(address.token_type, 'address')
2449+
self.assertEqual(len(address.mailboxes), 1)
2450+
self.assertEqual(address.mailboxes,
2451+
address.all_mailboxes)
2452+
self.assertEqual(address.mailboxes[0].display_name,
2453+
'Bérénice')
2454+
self.assertEqual(address[0].token_type,
2455+
'mailbox')
2456+
23682457
def test_get_address_empty_group(self):
23692458
address = self._test_get_x(parser.get_address,
23702459
'Monty Python:;',
Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,5 @@
1+
Fix bug in the parsing of email address headers that could result in
2+
extraneous spaces in the decoded text when using a modern email policy.
3+
Space between pairs of adjacent rfc2047 encoded-words is now ignored, per
4+
rfc2047 section 6.2 (and consistent with existing parsing of unstructured
5+
headers like *Subject*).

0 commit comments

Comments
 (0)