diff --git a/Lib/test/test_tokenize.py b/Lib/test/test_tokenize.py
index d90a7659c4237c..5cdbe3151d8530 100644
--- a/Lib/test/test_tokenize.py
+++ b/Lib/test/test_tokenize.py
@@ -1478,6 +1478,61 @@ def test_cookie_second_line_noncommented_first_line(self):
         expected = [b"print('\xc2\xa3')\n"]
         self.assertEqual(consumed_lines, expected)
 
+    def test_first_non_utf8_coding_line(self):
+        lines = (
+            b'#coding:iso-8859-15 \xa4\n',
+            b'print(something)\n'
+        )
+        encoding, consumed_lines = tokenize.detect_encoding(self.get_readline(lines))
+        self.assertEqual(encoding, 'iso-8859-15')
+        self.assertEqual(consumed_lines, list(lines[:1]))
+
+    def test_first_utf8_coding_line_error(self):
+        lines = (
+            b'#coding:ascii \xc3\xa4\n',
+            b'print(something)\n'
+        )
+        with self.assertRaises(SyntaxError):
+            tokenize.detect_encoding(self.get_readline(lines))
+
+    def test_second_non_utf8_coding_line(self):
+        lines = (
+            b'#!/usr/bin/python\n',
+            b'#coding:iso-8859-15 \xa4\n',
+            b'print(something)\n'
+        )
+        encoding, consumed_lines = tokenize.detect_encoding(self.get_readline(lines))
+        self.assertEqual(encoding, 'iso-8859-15')
+        self.assertEqual(consumed_lines, list(lines[:2]))
+
+    def test_second_utf8_coding_line_error(self):
+        lines = (
+            b'#!/usr/bin/python\n',
+            b'#coding:ascii \xc3\xa4\n',
+            b'print(something)\n'
+        )
+        with self.assertRaises(SyntaxError):
+            tokenize.detect_encoding(self.get_readline(lines))
+
+    def test_non_utf8_shebang(self):
+        lines = (
+            b'#!/home/\xa4/bin/python\n',
+            b'#coding:iso-8859-15\n',
+            b'print(something)\n'
+        )
+        encoding, consumed_lines = tokenize.detect_encoding(self.get_readline(lines))
+        self.assertEqual(encoding, 'iso-8859-15')
+        self.assertEqual(consumed_lines, list(lines[:2]))
+
+    def test_utf8_shebang_error(self):
+        lines = (
+            b'#!/home/\xc3\xa4/bin/python\n',
+            b'#coding:ascii\n',
+            b'print(something)\n'
+        )
+        with self.assertRaises(SyntaxError):
+            tokenize.detect_encoding(self.get_readline(lines))
+
     def test_cookie_second_line_empty_first_line(self):
         lines = (
             b'\n',
@@ -1531,6 +1586,28 @@ def test_double_coding_utf8(self):
         self.assertEqual(encoding, 'utf-8')
         self.assertEqual(consumed_lines, list(lines[:1]))
 
+    def test_nul_in_first_coding_line(self):
+        lines = (
+            b'#coding:iso8859-15\x00\n',
+            b'\n',
+            b'\n',
+            b'print(something)\n'
+        )
+        with self.assertRaisesRegex(SyntaxError,
+                                    "source code cannot contain null bytes"):
+            tokenize.detect_encoding(self.get_readline(lines))
+
+    def test_nul_in_second_coding_line(self):
+        lines = (
+            b'#!/usr/bin/python\n',
+            b'#coding:iso8859-15\x00\n',
+            b'\n',
+            b'print(something)\n'
+        )
+        with self.assertRaisesRegex(SyntaxError,
+                                    "source code cannot contain null bytes"):
+            tokenize.detect_encoding(self.get_readline(lines))
+
     def test_latin1_normalization(self):
         # See get_normal_name() in Parser/tokenizer/helpers.c.
         encodings = ("latin-1", "iso-8859-1", "iso-latin-1", "latin-1-unix",
diff --git a/Lib/tokenize.py b/Lib/tokenize.py
index 7e71755068e1df..1f31258ce361c9 100644
--- a/Lib/tokenize.py
+++ b/Lib/tokenize.py
@@ -36,7 +36,7 @@
 from token import EXACT_TOKEN_TYPES
 import _tokenize
 
-cookie_re = re.compile(r'^[ \t\f]*#.*?coding[:=][ \t]*([-\w.]+)', re.ASCII)
+cookie_re = re.compile(br'^[ \t\f]*#.*?coding[:=][ \t]*([-\w.]+)', re.ASCII)
 blank_re = re.compile(br'^[ \t\f]*(?:[#\r\n]|$)', re.ASCII)
 
 import token
@@ -385,22 +385,23 @@ def read_or_stop():
         except StopIteration:
             return b''
 
-    def find_cookie(line):
+    def check(line, encoding):
+        # Check if the line matches the encoding.
+        if 0 in line:
+            raise SyntaxError("source code cannot contain null bytes")
         try:
-            # Decode as UTF-8. Either the line is an encoding declaration,
-            # in which case it should be pure ASCII, or it must be UTF-8
-            # per default encoding.
-            line_string = line.decode('utf-8')
+            line.decode(encoding)
         except UnicodeDecodeError:
             msg = "invalid or missing encoding declaration"
             if filename is not None:
                 msg = '{} for {!r}'.format(msg, filename)
             raise SyntaxError(msg)
 
-        match = cookie_re.match(line_string)
+    def find_cookie(line):
+        match = cookie_re.match(line)
         if not match:
             return None
-        encoding = _get_normal_name(match.group(1))
+        encoding = _get_normal_name(match.group(1).decode())
         try:
             codec = lookup(encoding)
         except LookupError:
@@ -433,18 +434,23 @@ def find_cookie(line):
 
     encoding = find_cookie(first)
     if encoding:
+        check(first, encoding)
         return encoding, [first]
     if not blank_re.match(first):
+        check(first, default)
         return default, [first]
 
     second = read_or_stop()
     if not second:
+        check(first, default)
         return default, [first]
 
     encoding = find_cookie(second)
     if encoding:
+        check(first + second, encoding)
         return encoding, [first, second]
 
+    check(first + second, default)
     return default, [first, second]
 
 
diff --git a/Misc/NEWS.d/next/Library/2025-09-30-12-52-54.gh-issue-63161.mECM1A.rst b/Misc/NEWS.d/next/Library/2025-09-30-12-52-54.gh-issue-63161.mECM1A.rst
new file mode 100644
index 00000000000000..3daed20d099a8a
--- /dev/null
+++ b/Misc/NEWS.d/next/Library/2025-09-30-12-52-54.gh-issue-63161.mECM1A.rst
@@ -0,0 +1,3 @@
+Fix :func:`tokenize.detect_encoding`. Support non-UTF-8 shebang lines and
+comments when a non-UTF-8 encoding is declared. Detect decoding errors for
+non-UTF-8 encodings. Detect null bytes in source code.
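
Illustration (not part of the patch): a minimal, hypothetical snippet showing the behaviour this change enables. With the patch applied, detect_encoding() accepts non-ASCII bytes on the coding-cookie line as long as they decode in the declared encoding; previously the first line was decoded as UTF-8 before the cookie was read, so the same input raised SyntaxError ("invalid or missing encoding declaration"). The file contents below are made up for the example.

    import io
    import tokenize

    # b'\xa4' is a valid ISO-8859-15 byte but not valid UTF-8.
    source = (
        b'#coding:iso-8859-15 \xa4\n'
        b'print("hello")\n'
    )
    encoding, consumed = tokenize.detect_encoding(io.BytesIO(source).readline)
    print(encoding)  # 'iso-8859-15'
    print(consumed)  # [b'#coding:iso-8859-15 \xa4\n']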