Skip to content

Commit 2e73fb6

Browse files
serhiy-storchaka authored and miss-islington committed
pythongh-63161: Add more tests for source encoding (pythonGH-139440)
(cherry picked from commit b2f5ad0)

Co-authored-by: Serhiy Storchaka <[email protected]>
1 parent 6260b6a commit 2e73fb6

File tree

2 files changed

+177
-20
lines changed

2 files changed

+177
-20
lines changed

Lib/test/test_source_encoding.py

Lines changed: 95 additions & 7 deletions
Original file line number | Diff line number | Diff line change
@@ -173,6 +173,8 @@ def test_tokenizer_fstring_warning_in_first_line(self):
173173
os.unlink(TESTFN)
174174

175175

176+
BUFSIZ = 2**13
177+
176178
class AbstractSourceEncodingTest:
177179

178180
def test_default_coding(self):
@@ -185,14 +187,20 @@ def test_first_coding_line(self):
185187
self.check_script_output(src, br"'\xc3\u20ac'")
186188

187189
def test_second_coding_line(self):
188-
src = (b'#\n'
190+
src = (b'#!/usr/bin/python\n'
191+
b'#coding:iso8859-15\n'
192+
b'print(ascii("\xc3\xa4"))\n')
193+
self.check_script_output(src, br"'\xc3\u20ac'")
194+
195+
def test_second_coding_line_empty_first_line(self):
196+
src = (b'\n'
189197
b'#coding:iso8859-15\n'
190198
b'print(ascii("\xc3\xa4"))\n')
191199
self.check_script_output(src, br"'\xc3\u20ac'")
192200

193201
def test_third_coding_line(self):
194202
# Only first two lines are tested for a magic comment.
195-
src = (b'#\n'
203+
src = (b'#!/usr/bin/python\n'
196204
b'#\n'
197205
b'#coding:iso8859-15\n'
198206
b'print(ascii("\xc3\xa4"))\n')
@@ -210,13 +218,52 @@ def test_double_coding_same_line(self):
210218
b'print(ascii("\xc3\xa4"))\n')
211219
self.check_script_output(src, br"'\xc3\u20ac'")
212220

221+
def test_double_coding_utf8(self):
222+
src = (b'#coding:utf-8\n'
223+
b'#coding:latin1\n'
224+
b'print(ascii("\xc3\xa4"))\n')
225+
self.check_script_output(src, br"'\xe4'")
226+
227+
def test_long_first_coding_line(self):
228+
src = (b'#' + b' '*BUFSIZ + b'coding:iso8859-15\n'
229+
b'print(ascii("\xc3\xa4"))\n')
230+
self.check_script_output(src, br"'\xc3\u20ac'")
231+
232+
def test_long_second_coding_line(self):
233+
src = (b'#!/usr/bin/python\n'
234+
b'#' + b' '*BUFSIZ + b'coding:iso8859-15\n'
235+
b'print(ascii("\xc3\xa4"))\n')
236+
self.check_script_output(src, br"'\xc3\u20ac'")
237+
238+
def test_long_coding_line(self):
239+
src = (b'#coding:iso-8859-15' + b' '*BUFSIZ + b'\n'
240+
b'print(ascii("\xc3\xa4"))\n')
241+
self.check_script_output(src, br"'\xc3\u20ac'")
242+
243+
def test_long_coding_name(self):
244+
src = (b'#coding:iso-8859-1-' + b'x'*BUFSIZ + b'\n'
245+
b'print(ascii("\xc3\xa4"))\n')
246+
self.check_script_output(src, br"'\xc3\xa4'")
247+
248+
def test_long_first_utf8_line(self):
249+
src = b'#' + b'\xc3\xa4'*(BUFSIZ//2) + b'\n'
250+
self.check_script_output(src, b'')
251+
src = b'# ' + b'\xc3\xa4'*(BUFSIZ//2) + b'\n'
252+
self.check_script_output(src, b'')
253+
254+
def test_long_second_utf8_line(self):
255+
src = b'\n#' + b'\xc3\xa4'*(BUFSIZ//2) + b'\n'
256+
self.check_script_output(src, b'')
257+
src = b'\n# ' + b'\xc3\xa4'*(BUFSIZ//2) + b'\n'
258+
self.check_script_output(src, b'')
259+
213260
def test_first_non_utf8_coding_line(self):
214261
src = (b'#coding:iso-8859-15 \xa4\n'
215262
b'print(ascii("\xc3\xa4"))\n')
216263
self.check_script_output(src, br"'\xc3\u20ac'")
217264

218265
def test_second_non_utf8_coding_line(self):
219-
src = (b'\n'
266+
src = (b'#!/usr/bin/python\n'
220267
b'#coding:iso-8859-15 \xa4\n'
221268
b'print(ascii("\xc3\xa4"))\n')
222269
self.check_script_output(src, br"'\xc3\u20ac'")
@@ -225,27 +272,56 @@ def test_utf8_bom(self):
225272
src = (b'\xef\xbb\xbfprint(ascii("\xc3\xa4"))\n')
226273
self.check_script_output(src, br"'\xe4'")
227274

275+
def test_utf8_bom_utf8_comments(self):
276+
src = (b'\xef\xbb\xbf#\xc3\xa4\n'
277+
b'#\xc3\xa4\n'
278+
b'print(ascii("\xc3\xa4"))\n')
279+
self.check_script_output(src, br"'\xe4'")
280+
228281
def test_utf8_bom_and_utf8_coding_line(self):
229282
src = (b'\xef\xbb\xbf#coding:utf-8\n'
230283
b'print(ascii("\xc3\xa4"))\n')
231284
self.check_script_output(src, br"'\xe4'")
232285

286+
def test_utf8_non_utf8_comment_line_error(self):
287+
src = (b'#coding: utf8\n'
288+
b'#\n'
289+
b'#\xa4\n'
290+
b'raise RuntimeError\n')
291+
self.check_script_error(src,
292+
br"'utf-8' codec can't decode byte|"
293+
br"encoding problem: utf8")
294+
233295
def test_crlf(self):
234296
src = (b'print(ascii("""\r\n"""))\n')
235-
out = self.check_script_output(src, br"'\n'")
297+
self.check_script_output(src, br"'\n'")
236298

237299
def test_crcrlf(self):
238300
src = (b'print(ascii("""\r\r\n"""))\n')
239-
out = self.check_script_output(src, br"'\n\n'")
301+
self.check_script_output(src, br"'\n\n'")
240302

241303
def test_crcrcrlf(self):
242304
src = (b'print(ascii("""\r\r\r\n"""))\n')
243-
out = self.check_script_output(src, br"'\n\n\n'")
305+
self.check_script_output(src, br"'\n\n\n'")
244306

245307
def test_crcrcrlf2(self):
246308
src = (b'#coding:iso-8859-1\n'
247309
b'print(ascii("""\r\r\r\n"""))\n')
248-
out = self.check_script_output(src, br"'\n\n\n'")
310+
self.check_script_output(src, br"'\n\n\n'")
311+
312+
def test_nul_in_first_coding_line(self):
313+
src = (b'#coding:iso8859-15\x00\n'
314+
b'\n'
315+
b'\n'
316+
b'raise RuntimeError\n')
317+
self.check_script_error(src, br"source code (string )?cannot contain null bytes")
318+
319+
def test_nul_in_second_coding_line(self):
320+
src = (b'#!/usr/bin/python\n'
321+
b'#coding:iso8859-15\x00\n'
322+
b'\n'
323+
b'raise RuntimeError\n')
324+
self.check_script_error(src, br"source code (string )?cannot contain null bytes")
249325

250326

251327
class UTF8ValidatorTest(unittest.TestCase):
@@ -325,6 +401,10 @@ def check_script_output(self, src, expected):
325401
out = stdout.getvalue().encode('latin1')
326402
self.assertEqual(out.rstrip(), expected)
327403

404+
def check_script_error(self, src, expected):
405+
with self.assertRaisesRegex(SyntaxError, expected.decode()) as cm:
406+
exec(src)
407+
328408

329409
class FileSourceEncodingTest(AbstractSourceEncodingTest, unittest.TestCase):
330410

@@ -336,6 +416,14 @@ def check_script_output(self, src, expected):
336416
res = script_helper.assert_python_ok(fn)
337417
self.assertEqual(res.out.rstrip(), expected)
338418

419+
def check_script_error(self, src, expected):
420+
with tempfile.TemporaryDirectory() as tmpd:
421+
fn = os.path.join(tmpd, 'test.py')
422+
with open(fn, 'wb') as fp:
423+
fp.write(src)
424+
res = script_helper.assert_python_failure(fn)
425+
self.assertRegex(res.err.rstrip().splitlines()[-1], b'SyntaxError.*?' + expected)
426+
339427

340428
if __name__ == "__main__":
341429
unittest.main()

Lib/test/test_tokenize.py

Lines changed: 82 additions & 13 deletions
Original file line number | Diff line number | Diff line change
@@ -1342,24 +1342,63 @@ def readline():
13421342

13431343
def test_no_bom_no_encoding_cookie(self):
13441344
lines = (
1345-
b'# something\n',
1345+
b'#!/home/\xc3\xa4/bin/python\n',
1346+
b'# something \xe2\x82\xac\n',
13461347
b'print(something)\n',
13471348
b'do_something(else)\n'
13481349
)
13491350
encoding, consumed_lines = tokenize.detect_encoding(self.get_readline(lines))
13501351
self.assertEqual(encoding, 'utf-8')
13511352
self.assertEqual(consumed_lines, list(lines[:2]))
13521353

1354+
def test_no_bom_no_encoding_cookie_first_line_error(self):
1355+
lines = (
1356+
b'#!/home/\xa4/bin/python\n\n',
1357+
b'print(something)\n',
1358+
b'do_something(else)\n'
1359+
)
1360+
with self.assertRaises(SyntaxError):
1361+
tokenize.detect_encoding(self.get_readline(lines))
1362+
1363+
def test_no_bom_no_encoding_cookie_second_line_error(self):
1364+
lines = (
1365+
b'#!/usr/bin/python\n',
1366+
b'# something \xe2\n',
1367+
b'print(something)\n',
1368+
b'do_something(else)\n'
1369+
)
1370+
with self.assertRaises(SyntaxError):
1371+
tokenize.detect_encoding(self.get_readline(lines))
1372+
13531373
def test_bom_no_cookie(self):
13541374
lines = (
1355-
b'\xef\xbb\xbf# something\n',
1375+
b'\xef\xbb\xbf#!/home/\xc3\xa4/bin/python\n',
13561376
b'print(something)\n',
13571377
b'do_something(else)\n'
13581378
)
13591379
encoding, consumed_lines = tokenize.detect_encoding(self.get_readline(lines))
13601380
self.assertEqual(encoding, 'utf-8-sig')
13611381
self.assertEqual(consumed_lines,
1362-
[b'# something\n', b'print(something)\n'])
1382+
[b'#!/home/\xc3\xa4/bin/python\n', b'print(something)\n'])
1383+
1384+
def test_bom_no_cookie_first_line_error(self):
1385+
lines = (
1386+
b'\xef\xbb\xbf#!/home/\xa4/bin/python\n',
1387+
b'print(something)\n',
1388+
b'do_something(else)\n'
1389+
)
1390+
with self.assertRaises(SyntaxError):
1391+
tokenize.detect_encoding(self.get_readline(lines))
1392+
1393+
def test_bom_no_cookie_second_line_error(self):
1394+
lines = (
1395+
b'\xef\xbb\xbf#!/usr/bin/python\n',
1396+
b'# something \xe2\n',
1397+
b'print(something)\n',
1398+
b'do_something(else)\n'
1399+
)
1400+
with self.assertRaises(SyntaxError):
1401+
tokenize.detect_encoding(self.get_readline(lines))
13631402

13641403
def test_cookie_first_line_no_bom(self):
13651404
lines = (
@@ -1435,27 +1474,58 @@ def test_cookie_second_line_noncommented_first_line(self):
14351474
expected = [b"print('\xc2\xa3')\n"]
14361475
self.assertEqual(consumed_lines, expected)
14371476

1438-
def test_cookie_second_line_commented_first_line(self):
1477+
def test_cookie_second_line_empty_first_line(self):
14391478
lines = (
1440-
b"#print('\xc2\xa3')\n",
1479+
b'\n',
14411480
b'# vim: set fileencoding=iso8859-15 :\n',
14421481
b"print('\xe2\x82\xac')\n"
14431482
)
14441483
encoding, consumed_lines = tokenize.detect_encoding(self.get_readline(lines))
14451484
self.assertEqual(encoding, 'iso8859-15')
1446-
expected = [b"#print('\xc2\xa3')\n", b'# vim: set fileencoding=iso8859-15 :\n']
1485+
expected = [b'\n', b'# vim: set fileencoding=iso8859-15 :\n']
14471486
self.assertEqual(consumed_lines, expected)
14481487

1449-
def test_cookie_second_line_empty_first_line(self):
1488+
def test_cookie_third_line(self):
14501489
lines = (
1451-
b'\n',
1452-
b'# vim: set fileencoding=iso8859-15 :\n',
1453-
b"print('\xe2\x82\xac')\n"
1490+
b'#!/home/\xc3\xa4/bin/python\n',
1491+
b'# something\n',
1492+
b'# vim: set fileencoding=ascii :\n',
1493+
b'print(something)\n',
1494+
b'do_something(else)\n'
1495+
)
1496+
encoding, consumed_lines = tokenize.detect_encoding(self.get_readline(lines))
1497+
self.assertEqual(encoding, 'utf-8')
1498+
self.assertEqual(consumed_lines, list(lines[:2]))
1499+
1500+
def test_double_coding_line(self):
1501+
# If the first line matches the second line is ignored.
1502+
lines = (
1503+
b'#coding:iso8859-15\n',
1504+
b'#coding:latin1\n',
1505+
b'print(something)\n'
14541506
)
14551507
encoding, consumed_lines = tokenize.detect_encoding(self.get_readline(lines))
14561508
self.assertEqual(encoding, 'iso8859-15')
1457-
expected = [b'\n', b'# vim: set fileencoding=iso8859-15 :\n']
1458-
self.assertEqual(consumed_lines, expected)
1509+
self.assertEqual(consumed_lines, list(lines[:1]))
1510+
1511+
def test_double_coding_same_line(self):
1512+
lines = (
1513+
b'#coding:iso8859-15 coding:latin1\n',
1514+
b'print(something)\n'
1515+
)
1516+
encoding, consumed_lines = tokenize.detect_encoding(self.get_readline(lines))
1517+
self.assertEqual(encoding, 'iso8859-15')
1518+
self.assertEqual(consumed_lines, list(lines[:1]))
1519+
1520+
def test_double_coding_utf8(self):
1521+
lines = (
1522+
b'#coding:utf-8\n',
1523+
b'#coding:latin1\n',
1524+
b'print(something)\n'
1525+
)
1526+
encoding, consumed_lines = tokenize.detect_encoding(self.get_readline(lines))
1527+
self.assertEqual(encoding, 'utf-8')
1528+
self.assertEqual(consumed_lines, list(lines[:1]))
14591529

14601530
def test_latin1_normalization(self):
14611531
# See get_normal_name() in Parser/tokenizer/helpers.c.
@@ -1481,7 +1551,6 @@ def test_syntaxerror_latin1(self):
14811551
readline = self.get_readline(lines)
14821552
self.assertRaises(SyntaxError, tokenize.detect_encoding, readline)
14831553

1484-
14851554
def test_utf8_normalization(self):
14861555
# See get_normal_name() in Parser/tokenizer/helpers.c.
14871556
encodings = ("utf-8", "utf-8-mac", "utf-8-unix")

0 commit comments

Comments
 (0)