Skip to content
Closed
Show file tree
Hide file tree
Changes from 2 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 8 additions & 0 deletions Lib/test/test_tokenize.py
Original file line number Diff line number Diff line change
Expand Up @@ -1485,6 +1485,14 @@ def test_syntaxerror_latin1(self):
readline = self.get_readline(lines)
self.assertRaises(SyntaxError, tokenize.detect_encoding, readline)

def test_nonascii_coding(self):
    """detect_encoding() must honor a coding cookie even when the same
    line contains non-ASCII bytes after the declaration (gh-63161).

    Before the fix, the strict UTF-8 decode of the first line raised
    SyntaxError for such files even though the cookie itself is pure ASCII.
    """
    lines = (
        # Cookie line carrying a euro sign encoded in the declared charset;
        # the bytes after the cookie are not valid UTF-8.
        '#coding=iso8859-15 €'.encode('iso8859-15'),
    )
    readline = self.get_readline(lines)
    found, consumed_lines = tokenize.detect_encoding(readline)
    self.assertEqual(found, "iso8859-15")
    # The cookie is on the first line, so exactly that line is consumed.
    self.assertEqual(consumed_lines, list(lines))

def test_utf8_normalization(self):
# See get_normal_name() in Parser/tokenizer/helpers.c.
Expand Down
24 changes: 14 additions & 10 deletions Lib/tokenize.py
Original file line number Diff line number Diff line change
Expand Up @@ -386,20 +386,24 @@ def read_or_stop():
return b''

def find_cookie(line):
try:
# Decode as UTF-8. Either the line is an encoding declaration,
# in which case it should be pure ASCII, or it must be UTF-8
# per default encoding.
line_string = line.decode('utf-8')
except UnicodeDecodeError:
msg = "invalid or missing encoding declaration"
if filename is not None:
msg = '{} for {!r}'.format(msg, filename)
raise SyntaxError(msg)
# gh-63161: Use surrogateescape error handler to escape potential
# non-ASCII characters after the coding declaration.
line_string = line.decode('utf-8', 'surrogateescape')

match = cookie_re.match(line_string)
if not match:
try:
# Decode as UTF-8. Either the line is an encoding declaration,
# in which case it should be pure ASCII, or it must be UTF-8
# per default encoding.
line.decode('utf-8')
except UnicodeDecodeError:
msg = "invalid or missing encoding declaration"
if filename is not None:
msg = '{} for {!r}'.format(msg, filename)
raise SyntaxError(msg)
return None

encoding = _get_normal_name(match.group(1))
try:
codec = lookup(encoding)
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
Fix :func:`tokenize.detect_encoding` when the coding declaration line
contains non-ASCII characters after the cookie. Patch by Victor Stinner.
Loading