diff --git a/Lib/test/test_source_encoding.py b/Lib/test/test_source_encoding.py index 61b00778f8361c..c5280673ab8f23 100644 --- a/Lib/test/test_source_encoding.py +++ b/Lib/test/test_source_encoding.py @@ -173,6 +173,8 @@ def test_tokenizer_fstring_warning_in_first_line(self): os.unlink(TESTFN) +BUFSIZ = 2**13 + class AbstractSourceEncodingTest: def test_default_coding(self): @@ -185,14 +187,20 @@ def test_first_coding_line(self): self.check_script_output(src, br"'\xc3\u20ac'") def test_second_coding_line(self): - src = (b'#\n' + src = (b'#!/usr/bin/python\n' + b'#coding:iso8859-15\n' + b'print(ascii("\xc3\xa4"))\n') + self.check_script_output(src, br"'\xc3\u20ac'") + + def test_second_coding_line_empty_first_line(self): + src = (b'\n' b'#coding:iso8859-15\n' b'print(ascii("\xc3\xa4"))\n') self.check_script_output(src, br"'\xc3\u20ac'") def test_third_coding_line(self): # Only first two lines are tested for a magic comment. - src = (b'#\n' + src = (b'#!/usr/bin/python\n' b'#\n' b'#coding:iso8859-15\n' b'print(ascii("\xc3\xa4"))\n') @@ -210,13 +218,52 @@ def test_double_coding_same_line(self): b'print(ascii("\xc3\xa4"))\n') self.check_script_output(src, br"'\xc3\u20ac'") + def test_double_coding_utf8(self): + src = (b'#coding:utf-8\n' + b'#coding:latin1\n' + b'print(ascii("\xc3\xa4"))\n') + self.check_script_output(src, br"'\xe4'") + + def test_long_first_coding_line(self): + src = (b'#' + b' '*BUFSIZ + b'coding:iso8859-15\n' + b'print(ascii("\xc3\xa4"))\n') + self.check_script_output(src, br"'\xc3\u20ac'") + + def test_long_second_coding_line(self): + src = (b'#!/usr/bin/python\n' + b'#' + b' '*BUFSIZ + b'coding:iso8859-15\n' + b'print(ascii("\xc3\xa4"))\n') + self.check_script_output(src, br"'\xc3\u20ac'") + + def test_long_coding_line(self): + src = (b'#coding:iso-8859-15' + b' '*BUFSIZ + b'\n' + b'print(ascii("\xc3\xa4"))\n') + self.check_script_output(src, br"'\xc3\u20ac'") + + def test_long_coding_name(self): + src = (b'#coding:iso-8859-1-' + b'x'*BUFSIZ + b'\n' + b'print(ascii("\xc3\xa4"))\n') + self.check_script_output(src, br"'\xc3\xa4'") + + def test_long_first_utf8_line(self): + src = b'#' + b'\xc3\xa4'*(BUFSIZ//2) + b'\n' + self.check_script_output(src, b'') + src = b'# ' + b'\xc3\xa4'*(BUFSIZ//2) + b'\n' + self.check_script_output(src, b'') + + def test_long_second_utf8_line(self): + src = b'\n#' + b'\xc3\xa4'*(BUFSIZ//2) + b'\n' + self.check_script_output(src, b'') + src = b'\n# ' + b'\xc3\xa4'*(BUFSIZ//2) + b'\n' + self.check_script_output(src, b'') + def test_first_non_utf8_coding_line(self): src = (b'#coding:iso-8859-15 \xa4\n' b'print(ascii("\xc3\xa4"))\n') self.check_script_output(src, br"'\xc3\u20ac'") def test_second_non_utf8_coding_line(self): - src = (b'\n' + src = (b'#!/usr/bin/python\n' b'#coding:iso-8859-15 \xa4\n' b'print(ascii("\xc3\xa4"))\n') self.check_script_output(src, br"'\xc3\u20ac'") @@ -225,27 +272,56 @@ def test_utf8_bom(self): src = (b'\xef\xbb\xbfprint(ascii("\xc3\xa4"))\n') self.check_script_output(src, br"'\xe4'") + def test_utf8_bom_utf8_comments(self): + src = (b'\xef\xbb\xbf#\xc3\xa4\n' + b'#\xc3\xa4\n' + b'print(ascii("\xc3\xa4"))\n') + self.check_script_output(src, br"'\xe4'") + def test_utf8_bom_and_utf8_coding_line(self): src = (b'\xef\xbb\xbf#coding:utf-8\n' b'print(ascii("\xc3\xa4"))\n') self.check_script_output(src, br"'\xe4'") + def test_utf8_non_utf8_comment_line_error(self): + src = (b'#coding: utf8\n' + b'#\n' + b'#\xa4\n' + b'raise RuntimeError\n') + self.check_script_error(src, + br"'utf-8' codec can't decode byte|" + br"encoding problem: utf8") + def test_crlf(self): src = (b'print(ascii("""\r\n"""))\n') - out = self.check_script_output(src, br"'\n'") + self.check_script_output(src, br"'\n'") def test_crcrlf(self): src = (b'print(ascii("""\r\r\n"""))\n') - out = self.check_script_output(src, br"'\n\n'") + self.check_script_output(src, br"'\n\n'") def test_crcrcrlf(self): src = (b'print(ascii("""\r\r\r\n"""))\n') - out = self.check_script_output(src, br"'\n\n\n'") + self.check_script_output(src, br"'\n\n\n'") def test_crcrcrlf2(self): src = (b'#coding:iso-8859-1\n' b'print(ascii("""\r\r\r\n"""))\n') - out = self.check_script_output(src, br"'\n\n\n'") + self.check_script_output(src, br"'\n\n\n'") + + def test_nul_in_first_coding_line(self): + src = (b'#coding:iso8859-15\x00\n' + b'\n' + b'\n' + b'raise RuntimeError\n') + self.check_script_error(src, br"source code (string )?cannot contain null bytes") + + def test_nul_in_second_coding_line(self): + src = (b'#!/usr/bin/python\n' + b'#coding:iso8859-15\x00\n' + b'\n' + b'raise RuntimeError\n') + self.check_script_error(src, br"source code (string )?cannot contain null bytes") class UTF8ValidatorTest(unittest.TestCase): @@ -325,6 +401,10 @@ def check_script_output(self, src, expected): out = stdout.getvalue().encode('latin1') self.assertEqual(out.rstrip(), expected) + def check_script_error(self, src, expected): + with self.assertRaisesRegex(SyntaxError, expected.decode()) as cm: + exec(src) + class FileSourceEncodingTest(AbstractSourceEncodingTest, unittest.TestCase): @@ -336,6 +416,14 @@ def check_script_output(self, src, expected): res = script_helper.assert_python_ok(fn) self.assertEqual(res.out.rstrip(), expected) + def check_script_error(self, src, expected): + with tempfile.TemporaryDirectory() as tmpd: + fn = os.path.join(tmpd, 'test.py') + with open(fn, 'wb') as fp: + fp.write(src) + res = script_helper.assert_python_failure(fn) + self.assertRegex(res.err.rstrip().splitlines()[-1], b'SyntaxError.*?' + expected) + if __name__ == "__main__": unittest.main() diff --git a/Lib/test/test_tokenize.py b/Lib/test/test_tokenize.py index e9701eb130d3e3..e9a9ee4469d877 100644 --- a/Lib/test/test_tokenize.py +++ b/Lib/test/test_tokenize.py @@ -1342,7 +1342,8 @@ def readline(): def test_no_bom_no_encoding_cookie(self): lines = ( - b'# something\n', + b'#!/home/\xc3\xa4/bin/python\n', + b'# something \xe2\x82\xac\n', b'print(something)\n', b'do_something(else)\n' ) @@ -1350,16 +1351,54 @@ def test_no_bom_no_encoding_cookie(self): self.assertEqual(encoding, 'utf-8') self.assertEqual(consumed_lines, list(lines[:2])) + def test_no_bom_no_encoding_cookie_first_line_error(self): + lines = ( + b'#!/home/\xa4/bin/python\n\n', + b'print(something)\n', + b'do_something(else)\n' + ) + with self.assertRaises(SyntaxError): + tokenize.detect_encoding(self.get_readline(lines)) + + def test_no_bom_no_encoding_cookie_second_line_error(self): + lines = ( + b'#!/usr/bin/python\n', + b'# something \xe2\n', + b'print(something)\n', + b'do_something(else)\n' + ) + with self.assertRaises(SyntaxError): + tokenize.detect_encoding(self.get_readline(lines)) + def test_bom_no_cookie(self): lines = ( - b'\xef\xbb\xbf# something\n', + b'\xef\xbb\xbf#!/home/\xc3\xa4/bin/python\n', b'print(something)\n', b'do_something(else)\n' ) encoding, consumed_lines = tokenize.detect_encoding(self.get_readline(lines)) self.assertEqual(encoding, 'utf-8-sig') self.assertEqual(consumed_lines, - [b'# something\n', b'print(something)\n']) + [b'#!/home/\xc3\xa4/bin/python\n', b'print(something)\n']) + + def test_bom_no_cookie_first_line_error(self): + lines = ( + b'\xef\xbb\xbf#!/home/\xa4/bin/python\n', + b'print(something)\n', + b'do_something(else)\n' + ) + with self.assertRaises(SyntaxError): + tokenize.detect_encoding(self.get_readline(lines)) + + def test_bom_no_cookie_second_line_error(self): + lines = ( + b'\xef\xbb\xbf#!/usr/bin/python\n', + b'# something \xe2\n', + b'print(something)\n', + b'do_something(else)\n' + ) + with self.assertRaises(SyntaxError): + tokenize.detect_encoding(self.get_readline(lines)) def test_cookie_first_line_no_bom(self): lines = ( @@ -1435,27 +1474,58 @@ def test_cookie_second_line_noncommented_first_line(self): expected = [b"print('\xc2\xa3')\n"] self.assertEqual(consumed_lines, expected) - def test_cookie_second_line_commented_first_line(self): + def test_cookie_second_line_empty_first_line(self): lines = ( - b"#print('\xc2\xa3')\n", + b'\n', b'# vim: set fileencoding=iso8859-15 :\n', b"print('\xe2\x82\xac')\n" ) encoding, consumed_lines = tokenize.detect_encoding(self.get_readline(lines)) self.assertEqual(encoding, 'iso8859-15') - expected = [b"#print('\xc2\xa3')\n", b'# vim: set fileencoding=iso8859-15 :\n'] + expected = [b'\n', b'# vim: set fileencoding=iso8859-15 :\n'] self.assertEqual(consumed_lines, expected) - def test_cookie_second_line_empty_first_line(self): + def test_cookie_third_line(self): lines = ( - b'\n', - b'# vim: set fileencoding=iso8859-15 :\n', - b"print('\xe2\x82\xac')\n" + b'#!/home/\xc3\xa4/bin/python\n', + b'# something\n', + b'# vim: set fileencoding=ascii :\n', + b'print(something)\n', + b'do_something(else)\n' + ) + encoding, consumed_lines = tokenize.detect_encoding(self.get_readline(lines)) + self.assertEqual(encoding, 'utf-8') + self.assertEqual(consumed_lines, list(lines[:2])) + + def test_double_coding_line(self): + # If the first line matches the second line is ignored. + lines = ( + b'#coding:iso8859-15\n', + b'#coding:latin1\n', + b'print(something)\n' ) encoding, consumed_lines = tokenize.detect_encoding(self.get_readline(lines)) self.assertEqual(encoding, 'iso8859-15') - expected = [b'\n', b'# vim: set fileencoding=iso8859-15 :\n'] - self.assertEqual(consumed_lines, expected) + self.assertEqual(consumed_lines, list(lines[:1])) + + def test_double_coding_same_line(self): + lines = ( + b'#coding:iso8859-15 coding:latin1\n', + b'print(something)\n' + ) + encoding, consumed_lines = tokenize.detect_encoding(self.get_readline(lines)) + self.assertEqual(encoding, 'iso8859-15') + self.assertEqual(consumed_lines, list(lines[:1])) + + def test_double_coding_utf8(self): + lines = ( + b'#coding:utf-8\n', + b'#coding:latin1\n', + b'print(something)\n' + ) + encoding, consumed_lines = tokenize.detect_encoding(self.get_readline(lines)) + self.assertEqual(encoding, 'utf-8') + self.assertEqual(consumed_lines, list(lines[:1])) def test_latin1_normalization(self): # See get_normal_name() in Parser/tokenizer/helpers.c. @@ -1481,7 +1551,6 @@ def test_syntaxerror_latin1(self): readline = self.get_readline(lines) self.assertRaises(SyntaxError, tokenize.detect_encoding, readline) - def test_utf8_normalization(self): # See get_normal_name() in Parser/tokenizer/helpers.c. encodings = ("utf-8", "utf-8-mac", "utf-8-unix")