Skip to content

Commit fb7b944

Browse files
committed
gh-63161: Fix tokenize detect_encoding() for non-ASCII coding
1 parent 4fb338d commit fb7b944

File tree

2 files changed: 22 additions and 10 deletions.

2 files changed: 22 additions and 10 deletions.

Lib/test/test_tokenize.py

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1485,6 +1485,14 @@ def test_syntaxerror_latin1(self):
14851485
readline = self.get_readline(lines)
14861486
self.assertRaises(SyntaxError, tokenize.detect_encoding, readline)
14871487

1488+
def test_nonascii_coding(self):
1489+
# gh-63161: test non-ASCII coding
1490+
lines = (
1491+
'#coding=iso8859-15 €'.encode('iso8859-15'),
1492+
)
1493+
readline = self.get_readline(lines)
1494+
found, consumed_lines = tokenize.detect_encoding(readline)
1495+
self.assertEqual(found, "iso8859-15")
14881496

14891497
def test_utf8_normalization(self):
14901498
# See get_normal_name() in Parser/tokenizer/helpers.c.

Lib/tokenize.py

Lines changed: 14 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -386,20 +386,24 @@ def read_or_stop():
386386
return b''
387387

388388
def find_cookie(line):
389-
try:
390-
# Decode as UTF-8. Either the line is an encoding declaration,
391-
# in which case it should be pure ASCII, or it must be UTF-8
392-
# per default encoding.
393-
line_string = line.decode('utf-8')
394-
except UnicodeDecodeError:
395-
msg = "invalid or missing encoding declaration"
396-
if filename is not None:
397-
msg = '{} for {!r}'.format(msg, filename)
398-
raise SyntaxError(msg)
389+
# gh-63161: Use surrogateescape error handler to escape potential
390+
# non-ASCII characters after the coding declaration.
391+
line_string = line.decode('utf-8', 'surrogateescape')
399392

400393
match = cookie_re.match(line_string)
401394
if not match:
395+
try:
396+
# Decode as UTF-8. Either the line is an encoding declaration,
397+
# in which case it should be pure ASCII, or it must be UTF-8
398+
# per default encoding.
399+
line.decode('utf-8')
400+
except UnicodeDecodeError:
401+
msg = "invalid or missing encoding declaration"
402+
if filename is not None:
403+
msg = '{} for {!r}'.format(msg, filename)
404+
raise SyntaxError(msg)
402405
return None
406+
403407
encoding = _get_normal_name(match.group(1))
404408
try:
405409
codec = lookup(encoding)

0 commit comments

Comments
 (0)