Commit c8c0b20

gh-63161: Fix tokenize.detect_encoding()
* Support non-UTF-8 shebang and comments if a non-UTF-8 encoding is specified.
* Detect decoding errors for non-UTF-8 encodings.
* Detect null bytes in source code.
1 parent: b2f5ad0 · commit: c8c0b20
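
For illustration, a minimal sketch of the behavior this commit establishes, driven through io.BytesIO (the messages mirror the tests below; nothing here is part of the diff itself):

    import io
    import tokenize

    # A non-UTF-8 byte in a comment is accepted once a matching cookie is declared.
    src = b'#coding:iso-8859-15 \xa4\nprint(something)\n'
    encoding, consumed = tokenize.detect_encoding(io.BytesIO(src).readline)
    print(encoding)    # iso-8859-15

    # Bytes that do not decode under the declared encoding now raise SyntaxError.
    try:
        tokenize.detect_encoding(io.BytesIO(b'#coding:ascii \xc3\xa4\n').readline)
    except SyntaxError as exc:
        print(exc)     # invalid or missing encoding declaration

    # Null bytes are rejected outright.
    try:
        tokenize.detect_encoding(io.BytesIO(b'#coding:ascii\x00\n').readline)
    except SyntaxError as exc:
        print(exc)     # source code cannot contain null bytes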

3 files changed: +94, -8 lines

Lib/test/test_tokenize.py

Lines changed: 77 additions & 0 deletions
@@ -1478,6 +1478,61 @@ def test_cookie_second_line_noncommented_first_line(self):
         expected = [b"print('\xc2\xa3')\n"]
         self.assertEqual(consumed_lines, expected)
 
+    def test_first_non_utf8_coding_line(self):
+        lines = (
+            b'#coding:iso-8859-15 \xa4\n',
+            b'print(something)\n'
+        )
+        encoding, consumed_lines = tokenize.detect_encoding(self.get_readline(lines))
+        self.assertEqual(encoding, 'iso-8859-15')
+        self.assertEqual(consumed_lines, list(lines[:1]))
+
+    def test_first_utf8_coding_line_error(self):
+        lines = (
+            b'#coding:ascii \xc3\xa4\n',
+            b'print(something)\n'
+        )
+        with self.assertRaises(SyntaxError):
+            tokenize.detect_encoding(self.get_readline(lines))
+
+    def test_second_non_utf8_coding_line(self):
+        lines = (
+            b'#!/usr/bin/python\n',
+            b'#coding:iso-8859-15 \xa4\n',
+            b'print(something)\n'
+        )
+        encoding, consumed_lines = tokenize.detect_encoding(self.get_readline(lines))
+        self.assertEqual(encoding, 'iso-8859-15')
+        self.assertEqual(consumed_lines, list(lines[:2]))
+
+    def test_second_utf8_coding_line_error(self):
+        lines = (
+            b'#!/usr/bin/python\n',
+            b'#coding:ascii \xc3\xa4\n',
+            b'print(something)\n'
+        )
+        with self.assertRaises(SyntaxError):
+            tokenize.detect_encoding(self.get_readline(lines))
+
+    def test_non_utf8_shebang(self):
+        lines = (
+            b'#!/home/\xa4/bin/python\n',
+            b'#coding:iso-8859-15\n',
+            b'print(something)\n'
+        )
+        encoding, consumed_lines = tokenize.detect_encoding(self.get_readline(lines))
+        self.assertEqual(encoding, 'iso-8859-15')
+        self.assertEqual(consumed_lines, list(lines[:2]))
+
+    def test_utf8_shebang_error(self):
+        lines = (
+            b'#!/home/\xc3\xa4/bin/python\n',
+            b'#coding:ascii\n',
+            b'print(something)\n'
+        )
+        with self.assertRaises(SyntaxError):
+            tokenize.detect_encoding(self.get_readline(lines))
+
     def test_cookie_second_line_empty_first_line(self):
         lines = (
             b'\n',
@@ -1531,6 +1586,28 @@ def test_double_coding_utf8(self):
         self.assertEqual(encoding, 'utf-8')
         self.assertEqual(consumed_lines, list(lines[:1]))
 
+    def test_nul_in_first_coding_line(self):
+        lines = (
+            b'#coding:iso8859-15\x00\n',
+            b'\n',
+            b'\n',
+            b'print(something)\n'
+        )
+        with self.assertRaisesRegex(SyntaxError,
+                                    "source code cannot contain null bytes"):
+            tokenize.detect_encoding(self.get_readline(lines))
+
+    def test_nul_in_second_coding_line(self):
+        lines = (
+            b'#!/usr/bin/python\n',
+            b'#coding:iso8859-15\x00\n',
+            b'\n',
+            b'print(something)\n'
+        )
+        with self.assertRaisesRegex(SyntaxError,
+                                    "source code cannot contain null bytes"):
+            tokenize.detect_encoding(self.get_readline(lines))
+
     def test_latin1_normalization(self):
         # See get_normal_name() in Parser/tokenizer/helpers.c.
         encodings = ("latin-1", "iso-8859-1", "iso-latin-1", "latin-1-unix",
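
The get_readline() used above is a helper of the existing test suite. For readers trying these cases outside the suite, a hypothetical standalone equivalent (an assumption, not the suite's actual code) could look like:

    def make_readline(lines):
        # Return a callable that yields one bytes line per call, then b''
        # forever -- the readline shape tokenize.detect_encoding() expects.
        it = iter(lines)
        def readline():
            return next(it, b'')
        return readline

    # e.g. tokenize.detect_encoding(make_readline((b'#coding:iso-8859-15\n',)))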

Lib/tokenize.py

Lines changed: 14 additions & 8 deletions
@@ -36,7 +36,7 @@
 from token import EXACT_TOKEN_TYPES
 import _tokenize
 
-cookie_re = re.compile(r'^[ \t\f]*#.*?coding[:=][ \t]*([-\w.]+)', re.ASCII)
+cookie_re = re.compile(br'^[ \t\f]*#.*?coding[:=][ \t]*([-\w.]+)', re.ASCII)
 blank_re = re.compile(br'^[ \t\f]*(?:[#\r\n]|$)', re.ASCII)
 
 import token
@@ -385,22 +385,23 @@ def read_or_stop():
         except StopIteration:
             return b''
 
-    def find_cookie(line):
+    def check(line, encoding):
+        # Check if the line matches the encoding.
+        if 0 in line:
+            raise SyntaxError("source code cannot contain null bytes")
         try:
-            # Decode as UTF-8. Either the line is an encoding declaration,
-            # in which case it should be pure ASCII, or it must be UTF-8
-            # per default encoding.
-            line_string = line.decode('utf-8')
+            line.decode(encoding)
         except UnicodeDecodeError:
             msg = "invalid or missing encoding declaration"
             if filename is not None:
                 msg = '{} for {!r}'.format(msg, filename)
             raise SyntaxError(msg)
 
-        match = cookie_re.match(line_string)
+    def find_cookie(line):
+        match = cookie_re.match(line)
         if not match:
             return None
-        encoding = _get_normal_name(match.group(1))
+        encoding = _get_normal_name(match.group(1).decode())
         try:
             codec = lookup(encoding)
         except LookupError:
@@ -433,18 +434,23 @@ def find_cookie(line):
 
     encoding = find_cookie(first)
     if encoding:
+        check(first, encoding)
         return encoding, [first]
     if not blank_re.match(first):
+        check(first, default)
         return default, [first]
 
     second = read_or_stop()
     if not second:
+        check(first, default)
         return default, [first]
 
     encoding = find_cookie(second)
     if encoding:
+        check(first + second, encoding)
         return encoding, [first, second]
 
+    check(first + second, default)
     return default, [first, second]
 
 
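
A note on the design: cookie_re is now compiled from a bytes pattern, so the cookie is located in the raw line before any decoding, and only the consumed lines are then validated against the winning encoding by check(). A small standalone sketch of that matching step (mirroring the pattern in the diff above, without importing tokenize's private names):

    import re

    cookie_re = re.compile(br'^[ \t\f]*#.*?coding[:=][ \t]*([-\w.]+)', re.ASCII)

    # A non-UTF-8 shebang no longer has to survive a UTF-8 decode to be scanned.
    assert cookie_re.match(b'#!/home/\xa4/bin/python\n') is None

    m = cookie_re.match(b'# -*- coding: iso-8859-15 -*-\n')
    assert m.group(1).decode() == 'iso-8859-15'   # group(1) is bytes, hence .decode()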
Misc/NEWS.d/next/Library/….rst

Lines changed: 3 additions & 0 deletions
@@ -0,0 +1,3 @@
+Fix :func:`tokenize.detect_encoding`. Support non-UTF-8 shebang and comments
+if non-UTF-8 encoding is specified. Detect decoding error for non-UTF-8
+encoding. Detect null bytes in source code.
