Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
102 changes: 95 additions & 7 deletions Lib/test/test_source_encoding.py
Original file line number Diff line number Diff line change
Expand Up @@ -172,6 +172,8 @@ def test_tokenizer_fstring_warning_in_first_line(self):
os.unlink(TESTFN)


# Padding length (in bytes) used by the test_long_* cases to build very long
# first/second source lines.
# NOTE(review): presumably sized to exceed the tokenizer's read buffer -- confirm.
BUFSIZ = 2**13

class AbstractSourceEncodingTest:

def test_default_coding(self):
Expand All @@ -184,14 +186,20 @@ def test_first_coding_line(self):
self.check_script_output(src, br"'\xc3\u20ac'")

def test_second_coding_line(self):
src = (b'#\n'
src = (b'#!/usr/bin/python\n'
b'#coding:iso8859-15\n'
b'print(ascii("\xc3\xa4"))\n')
self.check_script_output(src, br"'\xc3\u20ac'")

def test_second_coding_line_empty_first_line(self):
src = (b'\n'
b'#coding:iso8859-15\n'
b'print(ascii("\xc3\xa4"))\n')
self.check_script_output(src, br"'\xc3\u20ac'")

def test_third_coding_line(self):
# Only first two lines are tested for a magic comment.
src = (b'#\n'
src = (b'#!/usr/bin/python\n'
b'#\n'
b'#coding:iso8859-15\n'
b'print(ascii("\xc3\xa4"))\n')
Expand All @@ -209,13 +217,52 @@ def test_double_coding_same_line(self):
b'print(ascii("\xc3\xa4"))\n')
self.check_script_output(src, br"'\xc3\u20ac'")

def test_double_coding_utf8(self):
src = (b'#coding:utf-8\n'
b'#coding:latin1\n'
b'print(ascii("\xc3\xa4"))\n')
self.check_script_output(src, br"'\xe4'")

def test_long_first_coding_line(self):
    # The cookie on line 1 is preceded by BUFSIZ spaces; it must still be
    # found and applied.
    src = (b'#' + b' '*BUFSIZ + b'coding:iso8859-15\n'
           b'print(ascii("\xc3\xa4"))\n')
    self.check_script_output(src, br"'\xc3\u20ac'")

def test_long_second_coding_line(self):
    # Same as the long first-line case, but the padded cookie sits on
    # line 2 after a shebang line.
    src = (b'#!/usr/bin/python\n'
           b'#' + b' '*BUFSIZ + b'coding:iso8859-15\n'
           b'print(ascii("\xc3\xa4"))\n')
    self.check_script_output(src, br"'\xc3\u20ac'")

def test_long_coding_line(self):
    # The cookie is followed by BUFSIZ trailing spaces; the encoding name
    # must still be recognised.
    src = (b'#coding:iso-8859-15' + b' '*BUFSIZ + b'\n'
           b'print(ascii("\xc3\xa4"))\n')
    self.check_script_output(src, br"'\xc3\u20ac'")

def test_long_coding_name(self):
    # The encoding name itself is BUFSIZ characters longer than
    # 'iso-8859-1-'.  The expected output shows each source byte decoded
    # as its own character (U+00C3, U+00A4).
    # NOTE(review): presumably relies on the 'iso-8859-1-*' name
    # normalisation in the tokenizer -- confirm.
    src = (b'#coding:iso-8859-1-' + b'x'*BUFSIZ + b'\n'
           b'print(ascii("\xc3\xa4"))\n')
    self.check_script_output(src, br"'\xc3\xa4'")

def test_long_first_utf8_line(self):
    # A first-line comment carrying BUFSIZ bytes of two-byte UTF-8
    # characters must run cleanly and print nothing.  The second variant
    # adds a space after '#', shifting every character by one byte.
    src = b'#' + b'\xc3\xa4'*(BUFSIZ//2) + b'\n'
    self.check_script_output(src, b'')
    src = b'# ' + b'\xc3\xa4'*(BUFSIZ//2) + b'\n'
    self.check_script_output(src, b'')

def test_long_second_utf8_line(self):
    # Same as the long first-line UTF-8 case, but the long comment is on
    # line 2 after an empty first line.  Both byte alignments are tried.
    src = b'\n#' + b'\xc3\xa4'*(BUFSIZ//2) + b'\n'
    self.check_script_output(src, b'')
    src = b'\n# ' + b'\xc3\xa4'*(BUFSIZ//2) + b'\n'
    self.check_script_output(src, b'')

def test_first_non_utf8_coding_line(self):
src = (b'#coding:iso-8859-15 \xa4\n'
b'print(ascii("\xc3\xa4"))\n')
self.check_script_output(src, br"'\xc3\u20ac'")

def test_second_non_utf8_coding_line(self):
src = (b'\n'
src = (b'#!/usr/bin/python\n'
b'#coding:iso-8859-15 \xa4\n'
b'print(ascii("\xc3\xa4"))\n')
self.check_script_output(src, br"'\xc3\u20ac'")
Expand All @@ -224,27 +271,56 @@ def test_utf8_bom(self):
src = (b'\xef\xbb\xbfprint(ascii("\xc3\xa4"))\n')
self.check_script_output(src, br"'\xe4'")

def test_utf8_bom_utf8_comments(self):
src = (b'\xef\xbb\xbf#\xc3\xa4\n'
b'#\xc3\xa4\n'
b'print(ascii("\xc3\xa4"))\n')
self.check_script_output(src, br"'\xe4'")

def test_utf8_bom_and_utf8_coding_line(self):
src = (b'\xef\xbb\xbf#coding:utf-8\n'
b'print(ascii("\xc3\xa4"))\n')
self.check_script_output(src, br"'\xe4'")

def test_utf8_non_utf8_comment_line_error(self):
src = (b'#coding: utf8\n'
b'#\n'
b'#\xa4\n'
b'raise RuntimeError\n')
self.check_script_error(src,
br"'utf-8' codec can't decode byte|"
br"encoding problem: utf8")

def test_crlf(self):
src = (b'print(ascii("""\r\n"""))\n')
out = self.check_script_output(src, br"'\n'")
self.check_script_output(src, br"'\n'")

def test_crcrlf(self):
src = (b'print(ascii("""\r\r\n"""))\n')
out = self.check_script_output(src, br"'\n\n'")
self.check_script_output(src, br"'\n\n'")

def test_crcrcrlf(self):
src = (b'print(ascii("""\r\r\r\n"""))\n')
out = self.check_script_output(src, br"'\n\n\n'")
self.check_script_output(src, br"'\n\n\n'")

def test_crcrcrlf2(self):
src = (b'#coding:iso-8859-1\n'
b'print(ascii("""\r\r\r\n"""))\n')
out = self.check_script_output(src, br"'\n\n\n'")
self.check_script_output(src, br"'\n\n\n'")

def test_nul_in_first_coding_line(self):
src = (b'#coding:iso8859-15\x00\n'
b'\n'
b'\n'
b'raise RuntimeError\n')
self.check_script_error(src, br"source code (string )?cannot contain null bytes")

def test_nul_in_second_coding_line(self):
src = (b'#!/usr/bin/python\n'
b'#coding:iso8859-15\x00\n'
b'\n'
b'raise RuntimeError\n')
self.check_script_error(src, br"source code (string )?cannot contain null bytes")


class UTF8ValidatorTest(unittest.TestCase):
Expand Down Expand Up @@ -324,6 +400,10 @@ def check_script_output(self, src, expected):
out = stdout.getvalue().encode('latin1')
self.assertEqual(out.rstrip(), expected)

def check_script_error(self, src, expected):
    # Execute *src* (bytes) in-process and require a SyntaxError whose
    # message matches *expected*.  *expected* arrives as a bytes regex, so
    # it is decoded to str before being used as the pattern.
    with self.assertRaisesRegex(SyntaxError, expected.decode()):
        exec(src)


class FileSourceEncodingTest(AbstractSourceEncodingTest, unittest.TestCase):

Expand All @@ -335,6 +415,14 @@ def check_script_output(self, src, expected):
res = script_helper.assert_python_ok(fn)
self.assertEqual(res.out.rstrip(), expected)

def check_script_error(self, src, expected):
    # Write *src* (bytes) to a temporary script, run it through
    # script_helper.assert_python_failure() and require that the last
    # stderr line reports a SyntaxError matching *expected* (bytes regex).
    with tempfile.TemporaryDirectory() as tmpd:
        fn = os.path.join(tmpd, 'test.py')
        with open(fn, 'wb') as fp:
            fp.write(src)
        # Run only after the file is closed so its content is flushed.
        res = script_helper.assert_python_failure(fn)
    # Callers pass alternations ('a|b').  Group the pattern so the
    # 'SyntaxError' prefix applies to every alternative, not just the first.
    self.assertRegex(res.err.rstrip().splitlines()[-1],
                     b'SyntaxError.*?(?:' + expected + b')')


if __name__ == "__main__":
unittest.main()
95 changes: 82 additions & 13 deletions Lib/test/test_tokenize.py
Original file line number Diff line number Diff line change
Expand Up @@ -1346,24 +1346,63 @@ def readline():

def test_no_bom_no_encoding_cookie(self):
    # No BOM and no cookie: the result is 'utf-8' and exactly the first
    # two lines are consumed.  Both lines carry valid non-ASCII UTF-8.
    lines = (
        b'#!/home/\xc3\xa4/bin/python\n',
        b'# something \xe2\x82\xac\n',
        b'print(something)\n',
        b'do_something(else)\n'
    )
    encoding, consumed_lines = tokenize.detect_encoding(self.get_readline(lines))
    self.assertEqual(encoding, 'utf-8')
    self.assertEqual(consumed_lines, list(lines[:2]))

def test_no_bom_no_encoding_cookie_first_line_error(self):
    # No BOM, no cookie, and a byte (\xa4) on line 1 that is not valid
    # UTF-8: detect_encoding() must raise SyntaxError.
    lines = (
        b'#!/home/\xa4/bin/python\n\n',
        b'print(something)\n',
        b'do_something(else)\n'
    )
    with self.assertRaises(SyntaxError):
        tokenize.detect_encoding(self.get_readline(lines))

def test_no_bom_no_encoding_cookie_second_line_error(self):
    # Line 1 is clean ASCII, but line 2 ends with a lone \xe2 byte, which
    # is not valid UTF-8: detect_encoding() must raise SyntaxError.
    lines = (
        b'#!/usr/bin/python\n',
        b'# something \xe2\n',
        b'print(something)\n',
        b'do_something(else)\n'
    )
    with self.assertRaises(SyntaxError):
        tokenize.detect_encoding(self.get_readline(lines))

def test_bom_no_cookie(self):
    # UTF-8 BOM without a cookie: the result is 'utf-8-sig', and the BOM
    # is stripped from the first consumed line.
    lines = (
        b'\xef\xbb\xbf#!/home/\xc3\xa4/bin/python\n',
        b'print(something)\n',
        b'do_something(else)\n'
    )
    encoding, consumed_lines = tokenize.detect_encoding(self.get_readline(lines))
    self.assertEqual(encoding, 'utf-8-sig')
    self.assertEqual(consumed_lines,
                     [b'#!/home/\xc3\xa4/bin/python\n', b'print(something)\n'])

def test_bom_no_cookie_first_line_error(self):
    # UTF-8 BOM followed by a byte (\xa4) on line 1 that is not valid
    # UTF-8: detect_encoding() must raise SyntaxError.
    lines = (
        b'\xef\xbb\xbf#!/home/\xa4/bin/python\n',
        b'print(something)\n',
        b'do_something(else)\n'
    )
    with self.assertRaises(SyntaxError):
        tokenize.detect_encoding(self.get_readline(lines))

def test_bom_no_cookie_second_line_error(self):
    # UTF-8 BOM with a clean first line, but line 2 ends with a lone \xe2
    # byte, which is not valid UTF-8: SyntaxError is expected.
    lines = (
        b'\xef\xbb\xbf#!/usr/bin/python\n',
        b'# something \xe2\n',
        b'print(something)\n',
        b'do_something(else)\n'
    )
    with self.assertRaises(SyntaxError):
        tokenize.detect_encoding(self.get_readline(lines))

def test_cookie_first_line_no_bom(self):
lines = (
Expand Down Expand Up @@ -1439,27 +1478,58 @@ def test_cookie_second_line_noncommented_first_line(self):
expected = [b"print('\xc2\xa3')\n"]
self.assertEqual(consumed_lines, expected)

def test_cookie_second_line_empty_first_line(self):
    # An empty first line followed by a vim-style cookie on line 2: the
    # cookie's encoding is reported and both lines are consumed.
    lines = (
        b'\n',
        b'# vim: set fileencoding=iso8859-15 :\n',
        b"print('\xe2\x82\xac')\n"
    )
    encoding, consumed_lines = tokenize.detect_encoding(self.get_readline(lines))
    self.assertEqual(encoding, 'iso8859-15')
    expected = [b'\n', b'# vim: set fileencoding=iso8859-15 :\n']
    self.assertEqual(consumed_lines, expected)

def test_cookie_third_line(self):
    # A cookie on line 3 is ignored: the result stays 'utf-8' and only the
    # first two lines are consumed.
    lines = (
        b'#!/home/\xc3\xa4/bin/python\n',
        b'# something\n',
        b'# vim: set fileencoding=ascii :\n',
        b'print(something)\n',
        b'do_something(else)\n'
    )
    encoding, consumed_lines = tokenize.detect_encoding(self.get_readline(lines))
    self.assertEqual(encoding, 'utf-8')
    self.assertEqual(consumed_lines, list(lines[:2]))

def test_double_coding_line(self):
    # If the first line matches, the second line is ignored: the first
    # cookie's encoding is reported and only one line is consumed.
    lines = (
        b'#coding:iso8859-15\n',
        b'#coding:latin1\n',
        b'print(something)\n'
    )
    encoding, consumed_lines = tokenize.detect_encoding(self.get_readline(lines))
    self.assertEqual(encoding, 'iso8859-15')
    self.assertEqual(consumed_lines, list(lines[:1]))

def test_double_coding_same_line(self):
    # Two cookies on the same line: the first one on the line is used.
    lines = (
        b'#coding:iso8859-15 coding:latin1\n',
        b'print(something)\n'
    )
    encoding, consumed_lines = tokenize.detect_encoding(self.get_readline(lines))
    self.assertEqual(encoding, 'iso8859-15')
    self.assertEqual(consumed_lines, list(lines[:1]))

def test_double_coding_utf8(self):
    # A utf-8 cookie on line 1 wins over a conflicting latin1 cookie on
    # line 2, and only the first line is consumed.
    lines = (
        b'#coding:utf-8\n',
        b'#coding:latin1\n',
        b'print(something)\n'
    )
    encoding, consumed_lines = tokenize.detect_encoding(self.get_readline(lines))
    self.assertEqual(encoding, 'utf-8')
    self.assertEqual(consumed_lines, list(lines[:1]))

def test_latin1_normalization(self):
# See get_normal_name() in Parser/tokenizer/helpers.c.
Expand All @@ -1485,7 +1555,6 @@ def test_syntaxerror_latin1(self):
readline = self.get_readline(lines)
self.assertRaises(SyntaxError, tokenize.detect_encoding, readline)


def test_utf8_normalization(self):
# See get_normal_name() in Parser/tokenizer/helpers.c.
encodings = ("utf-8", "utf-8-mac", "utf-8-unix")
Expand Down
Loading