
Commit 38d4b43

gh-63161: Fix tokenize.detect_encoding() (GH-139446)
* Support a non-UTF-8 shebang and comments if a non-UTF-8 encoding is specified.
* Detect decoding errors for non-UTF-8 encodings.
* Detect null bytes in source code.
Parent: 3222ea0

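In practice, the fixed detect_encoding() validates the inspected lines against the declared encoding rather than always against UTF-8, and it rejects null bytes outright. A minimal sketch of the new behavior, based on the tests added below (tokenize.detect_encoding() is the real API; the sample bytes are illustrative):

    import io
    import tokenize

    # A stray iso-8859-15 byte (0xa4) after the coding cookie is now
    # accepted, because the declared encoding, not UTF-8, is used to
    # validate the line.
    source = b'#coding:iso-8859-15 \xa4\nprint(something)\n'
    encoding, consumed = tokenize.detect_encoding(io.BytesIO(source).readline)
    print(encoding)  # 'iso-8859-15'
    print(consumed)  # [b'#coding:iso-8859-15 \xa4\n']

    # Null bytes in the inspected lines now raise SyntaxError.
    try:
        tokenize.detect_encoding(io.BytesIO(b'#coding:iso8859-15\x00\n').readline)
    except SyntaxError as exc:
        print(exc)  # source code cannot contain null bytes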
File tree: 3 files changed, +94 -8 lines


Lib/test/test_tokenize.py

Lines changed: 77 additions & 0 deletions
@@ -1495,6 +1495,61 @@ def test_cookie_second_line_noncommented_first_line(self):
         expected = [b"print('\xc2\xa3')\n"]
         self.assertEqual(consumed_lines, expected)
 
+    def test_first_non_utf8_coding_line(self):
+        lines = (
+            b'#coding:iso-8859-15 \xa4\n',
+            b'print(something)\n'
+        )
+        encoding, consumed_lines = tokenize.detect_encoding(self.get_readline(lines))
+        self.assertEqual(encoding, 'iso-8859-15')
+        self.assertEqual(consumed_lines, list(lines[:1]))
+
+    def test_first_utf8_coding_line_error(self):
+        lines = (
+            b'#coding:ascii \xc3\xa4\n',
+            b'print(something)\n'
+        )
+        with self.assertRaises(SyntaxError):
+            tokenize.detect_encoding(self.get_readline(lines))
+
+    def test_second_non_utf8_coding_line(self):
+        lines = (
+            b'#!/usr/bin/python\n',
+            b'#coding:iso-8859-15 \xa4\n',
+            b'print(something)\n'
+        )
+        encoding, consumed_lines = tokenize.detect_encoding(self.get_readline(lines))
+        self.assertEqual(encoding, 'iso-8859-15')
+        self.assertEqual(consumed_lines, list(lines[:2]))
+
+    def test_second_utf8_coding_line_error(self):
+        lines = (
+            b'#!/usr/bin/python\n',
+            b'#coding:ascii \xc3\xa4\n',
+            b'print(something)\n'
+        )
+        with self.assertRaises(SyntaxError):
+            tokenize.detect_encoding(self.get_readline(lines))
+
+    def test_non_utf8_shebang(self):
+        lines = (
+            b'#!/home/\xa4/bin/python\n',
+            b'#coding:iso-8859-15\n',
+            b'print(something)\n'
+        )
+        encoding, consumed_lines = tokenize.detect_encoding(self.get_readline(lines))
+        self.assertEqual(encoding, 'iso-8859-15')
+        self.assertEqual(consumed_lines, list(lines[:2]))
+
+    def test_utf8_shebang_error(self):
+        lines = (
+            b'#!/home/\xc3\xa4/bin/python\n',
+            b'#coding:ascii\n',
+            b'print(something)\n'
+        )
+        with self.assertRaises(SyntaxError):
+            tokenize.detect_encoding(self.get_readline(lines))
+
     def test_cookie_second_line_empty_first_line(self):
         lines = (
             b'\n',

@@ -1548,6 +1603,28 @@ def test_double_coding_utf8(self):
         self.assertEqual(encoding, 'utf-8')
         self.assertEqual(consumed_lines, list(lines[:1]))
 
+    def test_nul_in_first_coding_line(self):
+        lines = (
+            b'#coding:iso8859-15\x00\n',
+            b'\n',
+            b'\n',
+            b'print(something)\n'
+        )
+        with self.assertRaisesRegex(SyntaxError,
+                                    "source code cannot contain null bytes"):
+            tokenize.detect_encoding(self.get_readline(lines))
+
+    def test_nul_in_second_coding_line(self):
+        lines = (
+            b'#!/usr/bin/python\n',
+            b'#coding:iso8859-15\x00\n',
+            b'\n',
+            b'print(something)\n'
+        )
+        with self.assertRaisesRegex(SyntaxError,
+                                    "source code cannot contain null bytes"):
+            tokenize.detect_encoding(self.get_readline(lines))
+
     def test_latin1_normalization(self):
         # See get_normal_name() in Parser/tokenizer/helpers.c.
         encodings = ("latin-1", "iso-8859-1", "iso-latin-1", "latin-1-unix",

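The tests above drive detect_encoding() through the suite's get_readline() helper, which turns a tuple of byte lines into a readline callable. Its implementation is not part of this diff; a hypothetical stand-in would be:

    def get_readline(lines):
        # Hypothetical equivalent of the test helper: return each byte
        # line in turn, then b'' forever, like readline() on an
        # exhausted binary file.
        iterator = iter(lines)
        return lambda: next(iterator, b'')
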
Lib/tokenize.py

Lines changed: 14 additions & 8 deletions
@@ -36,7 +36,7 @@
 from token import EXACT_TOKEN_TYPES
 import _tokenize
 
-cookie_re = re.compile(r'^[ \t\f]*#.*?coding[:=][ \t]*([-\w.]+)', re.ASCII)
+cookie_re = re.compile(br'^[ \t\f]*#.*?coding[:=][ \t]*([-\w.]+)', re.ASCII)
 blank_re = re.compile(br'^[ \t\f]*(?:[#\r\n]|$)', re.ASCII)
 
 import token

@@ -385,22 +385,23 @@ def read_or_stop():
         except StopIteration:
             return b''
 
-    def find_cookie(line):
+    def check(line, encoding):
+        # Check if the line matches the encoding.
+        if 0 in line:
+            raise SyntaxError("source code cannot contain null bytes")
         try:
-            # Decode as UTF-8. Either the line is an encoding declaration,
-            # in which case it should be pure ASCII, or it must be UTF-8
-            # per default encoding.
-            line_string = line.decode('utf-8')
+            line.decode(encoding)
         except UnicodeDecodeError:
             msg = "invalid or missing encoding declaration"
             if filename is not None:
                 msg = '{} for {!r}'.format(msg, filename)
             raise SyntaxError(msg)
 
-        match = cookie_re.match(line_string)
+    def find_cookie(line):
+        match = cookie_re.match(line)
         if not match:
             return None
-        encoding = _get_normal_name(match.group(1))
+        encoding = _get_normal_name(match.group(1).decode())
         try:
             codec = lookup(encoding)
         except LookupError:

@@ -433,18 +434,23 @@ def find_cookie(line):
 
     encoding = find_cookie(first)
     if encoding:
+        check(first, encoding)
         return encoding, [first]
     if not blank_re.match(first):
+        check(first, default)
        return default, [first]
 
     second = read_or_stop()
     if not second:
+        check(first, default)
         return default, [first]
 
     encoding = find_cookie(second)
     if encoding:
+        check(first + second, encoding)
         return encoding, [first, second]
 
+    check(first + second, default)
     return default, [first, second]
 
 
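To make the new control flow concrete: find_cookie() now matches the raw bytes, and check() validates the inspected lines (a null-byte scan, then a decode round-trip) against whichever encoding wins, the cookie value or the UTF-8 default. A standalone sketch mirroring the helper from the diff (illustrative, not the module source; the real helper also adds the filename to the error message):

    def check(line, encoding):
        # Null bytes are rejected before decoding is attempted.
        if 0 in line:
            raise SyntaxError("source code cannot contain null bytes")
        try:
            line.decode(encoding)
        except UnicodeDecodeError:
            raise SyntaxError("invalid or missing encoding declaration")

    # The shebang and cookie lines are validated together against the
    # declared encoding, which is why a non-UTF-8 shebang now passes:
    check(b'#!/home/\xa4/bin/python\n#coding:iso-8859-15\n', 'iso-8859-15')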
Lines changed: 3 additions & 0 deletions
@@ -0,0 +1,3 @@
+Fix :func:`tokenize.detect_encoding`. Support non-UTF-8 shebang and comments
+if non-UTF-8 encoding is specified. Detect decoding error for non-UTF-8
+encoding. Detect null bytes in source code.
