77 changes: 77 additions & 0 deletions Lib/test/test_tokenize.py
@@ -1478,6 +1478,61 @@ def test_cookie_second_line_noncommented_first_line(self):
expected = [b"print('\xc2\xa3')\n"]
self.assertEqual(consumed_lines, expected)

def test_first_non_utf8_coding_line(self):
lines = (
b'#coding:iso-8859-15 \xa4\n',
b'print(something)\n'
)
encoding, consumed_lines = tokenize.detect_encoding(self.get_readline(lines))
self.assertEqual(encoding, 'iso-8859-15')
self.assertEqual(consumed_lines, list(lines[:1]))

def test_first_utf8_coding_line_error(self):
lines = (
b'#coding:ascii \xc3\xa4\n',
b'print(something)\n'
)
with self.assertRaises(SyntaxError):
tokenize.detect_encoding(self.get_readline(lines))

def test_second_non_utf8_coding_line(self):
lines = (
b'#!/usr/bin/python\n',
b'#coding:iso-8859-15 \xa4\n',
b'print(something)\n'
)
encoding, consumed_lines = tokenize.detect_encoding(self.get_readline(lines))
self.assertEqual(encoding, 'iso-8859-15')
self.assertEqual(consumed_lines, list(lines[:2]))

def test_second_utf8_coding_line_error(self):
lines = (
b'#!/usr/bin/python\n',
b'#coding:ascii \xc3\xa4\n',
b'print(something)\n'
)
with self.assertRaises(SyntaxError):
tokenize.detect_encoding(self.get_readline(lines))

def test_non_utf8_shebang(self):
lines = (
b'#!/home/\xa4/bin/python\n',
b'#coding:iso-8859-15\n',
b'print(something)\n'
)
encoding, consumed_lines = tokenize.detect_encoding(self.get_readline(lines))
self.assertEqual(encoding, 'iso-8859-15')
self.assertEqual(consumed_lines, list(lines[:2]))

def test_utf8_shebang_error(self):
lines = (
b'#!/home/\xc3\xa4/bin/python\n',
b'#coding:ascii\n',
b'print(something)\n'
)
with self.assertRaises(SyntaxError):
tokenize.detect_encoding(self.get_readline(lines))

def test_cookie_second_line_empty_first_line(self):
lines = (
b'\n',
@@ -1531,6 +1586,28 @@ def test_double_coding_utf8(self):
self.assertEqual(encoding, 'utf-8')
self.assertEqual(consumed_lines, list(lines[:1]))

def test_nul_in_first_coding_line(self):
lines = (
b'#coding:iso8859-15\x00\n',
b'\n',
b'\n',
b'print(something)\n'
)
with self.assertRaisesRegex(SyntaxError,
"source code cannot contain null bytes"):
tokenize.detect_encoding(self.get_readline(lines))

def test_nul_in_second_coding_line(self):
lines = (
b'#!/usr/bin/python\n',
b'#coding:iso8859-15\x00\n',
b'\n',
b'print(something)\n'
)
with self.assertRaisesRegex(SyntaxError,
"source code cannot contain null bytes"):
tokenize.detect_encoding(self.get_readline(lines))

def test_latin1_normalization(self):
# See get_normal_name() in Parser/tokenizer/helpers.c.
encodings = ("latin-1", "iso-8859-1", "iso-latin-1", "latin-1-unix",
22 changes: 14 additions & 8 deletions Lib/tokenize.py
@@ -36,7 +36,7 @@
from token import EXACT_TOKEN_TYPES
import _tokenize

cookie_re = re.compile(r'^[ \t\f]*#.*?coding[:=][ \t]*([-\w.]+)', re.ASCII)
cookie_re = re.compile(br'^[ \t\f]*#.*?coding[:=][ \t]*([-\w.]+)', re.ASCII)
blank_re = re.compile(br'^[ \t\f]*(?:[#\r\n]|$)', re.ASCII)

import token
@@ -385,22 +385,23 @@ def read_or_stop():
except StopIteration:
return b''

def find_cookie(line):
def check(line, encoding):
# Check if the line matches the encoding.
if 0 in line:
raise SyntaxError("source code cannot contain null bytes")
try:
# Decode as UTF-8. Either the line is an encoding declaration,
# in which case it should be pure ASCII, or it must be UTF-8
# per default encoding.
line_string = line.decode('utf-8')
line.decode(encoding)
except UnicodeDecodeError:
msg = "invalid or missing encoding declaration"
if filename is not None:
msg = '{} for {!r}'.format(msg, filename)
raise SyntaxError(msg)

match = cookie_re.match(line_string)
def find_cookie(line):
match = cookie_re.match(line)
if not match:
return None
encoding = _get_normal_name(match.group(1))
encoding = _get_normal_name(match.group(1).decode())
try:
codec = lookup(encoding)
except LookupError:
@@ -433,18 +434,23 @@ def find_cookie(line):

encoding = find_cookie(first)
if encoding:
check(first, encoding)
return encoding, [first]
if not blank_re.match(first):
check(first, default)
return default, [first]

second = read_or_stop()
if not second:
check(first, default)
return default, [first]

encoding = find_cookie(second)
if encoding:
check(first + second, encoding)
return encoding, [first, second]

check(first + second, default)
return default, [first, second]


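For context, a minimal usage sketch (not part of the diff) of the behaviour this change enables: a byte that is invalid UTF-8 in the shebang line no longer trips `detect_encoding()` as long as the declared cookie encoding can decode it. The source bytes below are made up for illustration.

```python
import io
import tokenize

# 0xa4 is invalid as UTF-8 but valid in ISO-8859-15; before this change the
# shebang line had to be decodable as UTF-8 even when a cookie said otherwise.
source = (b'#!/home/\xa4/bin/python\n'
          b'#coding:iso-8859-15\n'
          b'print("hello")\n')

encoding, consumed = tokenize.detect_encoding(io.BytesIO(source).readline)
print(encoding)       # 'iso-8859-15'
print(len(consumed))  # 2 -- the shebang and the coding line
```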
@@ -0,0 +1,3 @@
Fix :func:`tokenize.detect_encoding`. Support a non-UTF-8 shebang and comments
when a non-UTF-8 encoding is specified in the coding cookie. Detect decoding
errors for non-UTF-8 encodings. Detect null bytes in source code.
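And a similarly illustrative sketch of the null-byte detection mentioned above (again an example run against the patched behaviour, not part of the change itself):

```python
import io
import tokenize

# A NUL byte in a header line now raises SyntaxError instead of being
# accepted by detect_encoding().
source = b'#coding:iso8859-15\x00\nprint("hello")\n'
try:
    tokenize.detect_encoding(io.BytesIO(source).readline)
except SyntaxError as exc:
    print(exc)  # source code cannot contain null bytes
```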