
Commit 5c942f1

gh-63161: Fix PEP 263 support (GH-139481)
* Support a non-UTF-8 shebang and comments if a non-UTF-8 encoding is specified.
* Detect decoding errors in comments for the UTF-8 encoding.
* Include the decoding error position for the default encoding in SyntaxError.
1 parent d0b18b1 commit 5c942f1
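For orientation, the headline change can be illustrated with a small sketch adapted from the new tests below (not code from the commit itself): a non-UTF-8 byte in the shebang line no longer fails to compile when a matching encoding is declared on the next line.

src = (b'#!/home/\xa4/bin/python\n'          # 0xa4 is not valid UTF-8
       b'# -*- coding: iso-8859-15 -*-\n'    # but a suitable codec is declared
       b'print("ok")\n')
exec(compile(src, '<demo>', 'exec'))         # expected to print "ok" with this fix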

9 files changed: +210 -46 lines changed


Lib/test/test_exceptions.py

Lines changed: 6 additions & 2 deletions
@@ -224,6 +224,8 @@ def check(self, src, lineno, offset, end_lineno=None, end_offset=None, encoding=
         if not isinstance(src, str):
             src = src.decode(encoding, 'replace')
         line = src.split('\n')[lineno-1]
+        if lineno == 1:
+            line = line.removeprefix('\ufeff')
         self.assertIn(line, cm.exception.text)

     def test_error_offset_continuation_characters(self):
@@ -239,7 +241,9 @@ def testSyntaxErrorOffset(self):
         check('Python = "\u1e54\xfd\u0163\u0125\xf2\xf1" +', 1, 20)
         check(b'# -*- coding: cp1251 -*-\nPython = "\xcf\xb3\xf2\xee\xed" +',
               2, 19, encoding='cp1251')
-        check(b'Python = "\xcf\xb3\xf2\xee\xed" +', 1, 10)
+        check(b'Python = "\xcf\xb3\xf2\xee\xed" +', 1, 12)
+        check(b'\n\n\nPython = "\xcf\xb3\xf2\xee\xed" +', 4, 12)
+        check(b'\xef\xbb\xbfPython = "\xcf\xb3\xf2\xee\xed" +', 1, 12)
         check('x = "a', 1, 5)
         check('lambda x: x = 2', 1, 1)
         check('f{a + b + c}', 1, 2)
@@ -287,7 +291,7 @@ def baz():
         check("pass\npass\npass\n(1+)\npass\npass\npass", 4, 4)
         check("(1+)", 1, 4)
         check("[interesting\nfoo()\n", 1, 1)
-        check(b"\xef\xbb\xbf#coding: utf8\nprint('\xe6\x88\x91')\n", 0, -1)
+        check(b"\xef\xbb\xbf#coding: utf8\nprint('\xe6\x88\x91')\n", 1, 0)
         check("""f'''
             {
             (123_a)
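The adjusted offsets above can be probed directly. The following sketch is not part of the diff; it compiles one of the byte strings from the test and prints where the decoding error is attributed (the test expects line 1, offset 12).

src = b'Python = "\xcf\xb3\xf2\xee\xed" +'   # cp1251 bytes, no coding cookie
try:
    compile(src, '<demo>', 'exec')
except SyntaxError as exc:
    # With this change the position of the first undecodable byte is reported.
    print(exc.lineno, exc.offset, exc.msg)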

Lib/test/test_source_encoding.py

Lines changed: 112 additions & 11 deletions
@@ -1,7 +1,8 @@
 # -*- coding: utf-8 -*-

 import unittest
-from test.support import script_helper, captured_stdout, requires_subprocess, requires_resource
+from test import support
+from test.support import script_helper
 from test.support.os_helper import TESTFN, unlink, rmtree
 from test.support.import_helper import unload
 import importlib
@@ -64,7 +65,7 @@ def test_issue7820(self):
         # two bytes in common with the UTF-8 BOM
         self.assertRaises(SyntaxError, eval, b'\xef\xbb\x20')

-    @requires_subprocess()
+    @support.requires_subprocess()
     def test_20731(self):
         sub = subprocess.Popen([sys.executable,
                         os.path.join(os.path.dirname(__file__),
@@ -267,6 +268,17 @@ def test_second_non_utf8_coding_line(self):
                b'print(ascii("\xc3\xa4"))\n')
         self.check_script_output(src, br"'\xc3\u20ac'")

+    def test_first_utf8_coding_line_error(self):
+        src = (b'#coding:ascii \xc3\xa4\n'
+               b'raise RuntimeError\n')
+        self.check_script_error(src, br"(\(unicode error\) )?'ascii' codec can't decode byte")
+
+    def test_second_utf8_coding_line_error(self):
+        src = (b'#!/usr/bin/python\n'
+               b'#coding:ascii \xc3\xa4\n'
+               b'raise RuntimeError\n')
+        self.check_script_error(src, br"(\(unicode error\) )?'ascii' codec can't decode byte")
+
     def test_utf8_bom(self):
         src = (b'\xef\xbb\xbfprint(ascii("\xc3\xa4"))\n')
         self.check_script_output(src, br"'\xe4'")
@@ -282,10 +294,80 @@ def test_utf8_bom_and_utf8_coding_line(self):
                b'print(ascii("\xc3\xa4"))\n')
         self.check_script_output(src, br"'\xe4'")

-    def test_utf8_non_utf8_comment_line_error(self):
+    def test_utf8_bom_and_non_utf8_first_coding_line(self):
+        src = (b'\xef\xbb\xbf#coding:iso-8859-15\n'
+               b'raise RuntimeError\n')
+        self.check_script_error(src,
+                                br"encoding problem: iso-8859-15 with BOM",
+                                lineno=1)
+
+    def test_utf8_bom_and_non_utf8_second_coding_line(self):
+        src = (b'\xef\xbb\xbf#first\n'
+               b'#coding:iso-8859-15\n'
+               b'raise RuntimeError\n')
+        self.check_script_error(src,
+                                br"encoding problem: iso-8859-15 with BOM",
+                                lineno=2)
+
+    def test_non_utf8_shebang(self):
+        src = (b'#!/home/\xa4/bin/python\n'
+               b'#coding:iso-8859-15\n'
+               b'print(ascii("\xc3\xa4"))\n')
+        self.check_script_output(src, br"'\xc3\u20ac'")
+
+    def test_utf8_shebang_error(self):
+        src = (b'#!/home/\xc3\xa4/bin/python\n'
+               b'#coding:ascii\n'
+               b'raise RuntimeError\n')
+        self.check_script_error(src, br"(\(unicode error\) )?'ascii' codec can't decode byte")
+
+    def test_non_utf8_shebang_error(self):
+        src = (b'#!/home/\xa4/bin/python\n'
+               b'raise RuntimeError\n')
+        self.check_script_error(src, br"Non-UTF-8 code starting with .* on line 1",
+                                lineno=1)
+
+    def test_non_utf8_second_line_error(self):
+        src = (b'#first\n'
+               b'#second\xa4\n'
+               b'raise RuntimeError\n')
+        self.check_script_error(src,
+                                br"Non-UTF-8 code starting with .* on line 2",
+                                lineno=2)
+
+    def test_non_utf8_third_line_error(self):
+        src = (b'#first\n'
+               b'#second\n'
+               b'#third\xa4\n'
+               b'raise RuntimeError\n')
+        self.check_script_error(src,
+                                br"Non-UTF-8 code starting with .* on line 3",
+                                lineno=3)
+
+    def test_utf8_bom_non_utf8_third_line_error(self):
+        src = (b'\xef\xbb\xbf#first\n'
+               b'#second\n'
+               b'#third\xa4\n'
+               b'raise RuntimeError\n')
+        self.check_script_error(src,
+                                br"Non-UTF-8 code starting with .* on line 3|"
+                                br"'utf-8' codec can't decode byte",
+                                lineno=3)
+
+    def test_utf_8_non_utf8_third_line_error(self):
+        src = (b'#coding: utf-8\n'
+               b'#second\n'
+               b'#third\xa4\n'
+               b'raise RuntimeError\n')
+        self.check_script_error(src,
+                                br"Non-UTF-8 code starting with .* on line 3|"
+                                br"'utf-8' codec can't decode byte",
+                                lineno=3)
+
+    def test_utf8_non_utf8_third_line_error(self):
         src = (b'#coding: utf8\n'
-               b'#\n'
-               b'#\xa4\n'
+               b'#second\n'
+               b'#third\xa4\n'
                b'raise RuntimeError\n')
         self.check_script_error(src,
                                 br"'utf-8' codec can't decode byte|"
@@ -326,7 +408,7 @@ def test_nul_in_second_coding_line(self):
 class UTF8ValidatorTest(unittest.TestCase):
     @unittest.skipIf(not sys.platform.startswith("linux"),
                      "Too slow to run on non-Linux platforms")
-    @requires_resource('cpu')
+    @support.requires_resource('cpu')
     def test_invalid_utf8(self):
         # This is a port of test_utf8_decode_invalid_sequences in
         # test_unicode.py to exercise the separate utf8 validator in
@@ -392,19 +474,29 @@ def check(content):
         check(b'\xF4'+cb+b'\xBF\xBF')


+@support.force_not_colorized_test_class
 class BytesSourceEncodingTest(AbstractSourceEncodingTest, unittest.TestCase):

     def check_script_output(self, src, expected):
-        with captured_stdout() as stdout:
+        with support.captured_stdout() as stdout:
             exec(src)
         out = stdout.getvalue().encode('latin1')
         self.assertEqual(out.rstrip(), expected)

-    def check_script_error(self, src, expected):
-        with self.assertRaisesRegex(SyntaxError, expected.decode()) as cm:
+    def check_script_error(self, src, expected, lineno=...):
+        with self.assertRaises(SyntaxError) as cm:
             exec(src)
+        exc = cm.exception
+        self.assertRegex(str(exc), expected.decode())
+        if lineno is not ...:
+            self.assertEqual(exc.lineno, lineno)
+            line = src.splitlines()[lineno-1].decode(errors='replace')
+            if lineno == 1:
+                line = line.removeprefix('\ufeff')
+            self.assertEqual(line, exc.text)


+@support.force_not_colorized_test_class
 class FileSourceEncodingTest(AbstractSourceEncodingTest, unittest.TestCase):

     def check_script_output(self, src, expected):
@@ -415,13 +507,22 @@ def check_script_output(self, src, expected):
             res = script_helper.assert_python_ok(fn)
         self.assertEqual(res.out.rstrip(), expected)

-    def check_script_error(self, src, expected):
+    def check_script_error(self, src, expected, lineno=...):
         with tempfile.TemporaryDirectory() as tmpd:
             fn = os.path.join(tmpd, 'test.py')
             with open(fn, 'wb') as fp:
                 fp.write(src)
             res = script_helper.assert_python_failure(fn)
-        self.assertRegex(res.err.rstrip().splitlines()[-1], b'SyntaxError.*?' + expected)
+        err = res.err.rstrip()
+        self.assertRegex(err.splitlines()[-1], b'SyntaxError: ' + expected)
+        if lineno is not ...:
+            self.assertIn(f', line {lineno}\n'.encode(),
+                          err.replace(os.linesep.encode(), b'\n'))
+            line = src.splitlines()[lineno-1].decode(errors='replace')
+            if lineno == 1:
+                line = line.removeprefix('\ufeff')
+            self.assertIn(line.encode(), err)
+


 if __name__ == "__main__":
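Outside the test suite, the FileSourceEncodingTest behaviour can be reproduced with a throwaway script. This is an illustrative sketch, not code from the commit; the expected message follows test_non_utf8_second_line_error above.

import os, subprocess, sys, tempfile

with tempfile.TemporaryDirectory() as tmpd:
    fn = os.path.join(tmpd, 'bad_comment.py')
    with open(fn, 'wb') as fp:
        fp.write(b'#first\n#second\xa4\nraise RuntimeError\n')
    res = subprocess.run([sys.executable, fn], capture_output=True)
    # Expect a SyntaxError naming the file and "Non-UTF-8 code ... on line 2",
    # together with the offending comment line in the traceback.
    print(res.stderr.decode(errors='replace'))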
Lines changed: 5 additions & 0 deletions
@@ -0,0 +1,5 @@
+Support non-UTF-8 shebang and comments in Python source files if non-UTF-8
+encoding is specified. Detect decoding error in comments for default (UTF-8)
+encoding. Show the line and position of decoding error for default encoding
+in a traceback. Show the line containing the coding cookie when it conflicts
+with the BOM in a traceback.
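The last sentence of the entry corresponds to test_utf8_bom_and_non_utf8_first_coding_line above. A rough interactive check (assumed behaviour based on that test, not code from the commit):

src = b'\xef\xbb\xbf#coding:iso-8859-15\npass\n'   # UTF-8 BOM plus a conflicting cookie
try:
    compile(src, '<demo>', 'exec')
except SyntaxError as exc:
    # Expect "encoding problem: iso-8859-15 with BOM"; with this change,
    # exc.lineno and exc.text should point at the coding-cookie line.
    print(exc.lineno, exc.text, exc.msg)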

Parser/pegen_errors.c

Lines changed: 8 additions & 0 deletions
@@ -2,6 +2,7 @@
 #include <errcode.h>

 #include "pycore_pyerrors.h"   // _PyErr_ProgramDecodedTextObject()
+#include "pycore_runtime.h"    // _Py_ID()
 #include "lexer/state.h"
 #include "lexer/lexer.h"
 #include "pegen.h"
@@ -23,6 +24,13 @@ _PyPegen_raise_tokenizer_init_error(PyObject *filename)
     PyObject *value;
     PyObject *tback;
     PyErr_Fetch(&type, &value, &tback);
+    if (PyErr_GivenExceptionMatches(value, PyExc_SyntaxError)) {
+        if (PyObject_SetAttr(value, &_Py_ID(filename), filename)) {
+            goto error;
+        }
+        PyErr_Restore(type, value, tback);
+        return;
+    }
     errstr = PyObject_Str(value);
     if (!errstr) {
         goto error;
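In Python terms, the added branch behaves roughly like the sketch below. This is an illustrative paraphrase only; the real function works on the fetched C-level exception objects, and the fallback path is abbreviated.

def raise_tokenizer_init_error(pending_exc, filename):
    if isinstance(pending_exc, SyntaxError):
        # Reuse the SyntaxError raised during tokenizer setup and just record
        # which file it came from, instead of re-wrapping its message.
        pending_exc.filename = filename
        raise pending_exc
    # Otherwise fall through to the existing path, which builds a new
    # SyntaxError from str(pending_exc) (details omitted here).
    raise SyntaxError(str(pending_exc))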

Parser/tokenizer/file_tokenizer.c

Lines changed: 35 additions & 14 deletions
@@ -282,10 +282,8 @@ tok_underflow_interactive(struct tok_state *tok) {
 }

 static int
-tok_underflow_file(struct tok_state *tok) {
-    if (tok->start == NULL && !INSIDE_FSTRING(tok)) {
-        tok->cur = tok->inp = tok->buf;
-    }
+tok_underflow_file(struct tok_state *tok)
+{
     if (tok->decoding_state == STATE_INIT) {
         /* We have not yet determined the encoding.
            If an encoding is found, use the file-pointer
@@ -296,8 +294,16 @@ tok_underflow_file(struct tok_state *tok) {
         }
         assert(tok->decoding_state != STATE_INIT);
     }
+    int raw = tok->decoding_readline == NULL;
+    if (raw && tok->decoding_state != STATE_NORMAL) {
+        /* Keep the first line in the buffer to validate it later if
+         * the encoding has not yet been determined. */
+    }
+    else if (tok->start == NULL && !INSIDE_FSTRING(tok)) {
+        tok->cur = tok->inp = tok->buf;
+    }
     /* Read until '\n' or EOF */
-    if (tok->decoding_readline != NULL) {
+    if (!raw) {
         /* We already have a codec associated with this input. */
         if (!tok_readline_recode(tok)) {
             return 0;
@@ -328,20 +334,35 @@ tok_underflow_file(struct tok_state *tok) {

     ADVANCE_LINENO();
     if (tok->decoding_state != STATE_NORMAL) {
-        if (tok->lineno > 2) {
-            tok->decoding_state = STATE_NORMAL;
-        }
-        else if (!_PyTokenizer_check_coding_spec(tok->cur, strlen(tok->cur),
+        if (!_PyTokenizer_check_coding_spec(tok->cur, strlen(tok->cur),
                                                  tok, fp_setreadl))
         {
             return 0;
         }
+        if (tok->lineno >= 2) {
+            tok->decoding_state = STATE_NORMAL;
+        }
     }
-    /* The default encoding is UTF-8, so make sure we don't have any
-       non-UTF-8 sequences in it. */
-    if (!tok->encoding && !_PyTokenizer_ensure_utf8(tok->cur, tok)) {
-        _PyTokenizer_error_ret(tok);
-        return 0;
+    if (raw && tok->decoding_state == STATE_NORMAL) {
+        const char *line = tok->lineno <= 2 ? tok->buf : tok->cur;
+        int lineno = tok->lineno <= 2 ? 1 : tok->lineno;
+        if (!tok->encoding) {
+            /* The default encoding is UTF-8, so make sure we don't have any
+               non-UTF-8 sequences in it. */
+            if (!_PyTokenizer_ensure_utf8(line, tok, lineno)) {
+                _PyTokenizer_error_ret(tok);
+                return 0;
+            }
+        }
+        else {
+            PyObject *tmp = PyUnicode_Decode(line, strlen(line),
+                                             tok->encoding, NULL);
+            if (tmp == NULL) {
+                _PyTokenizer_error_ret(tok);
+                return 0;
+            }
+            Py_DECREF(tmp);
+        }
     }
     assert(tok->done == E_OK);
     return tok->done == E_OK;
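Loosely, the new per-line validation amounts to the Python paraphrase below. It is a sketch under the assumption that declared_encoding is the codec named by the coding cookie (or None for the default); it is not the actual C implementation, which also keeps the first line buffered until the cookie has been resolved.

def validate_raw_line(line_bytes, declared_encoding):
    if declared_encoding is None:
        # Default encoding: reject non-UTF-8 bytes; the decode error carries
        # the position that ends up in the SyntaxError.
        line_bytes.decode('utf-8')
    else:
        # A codec was declared: the raw line only has to decode with it,
        # so non-UTF-8 shebangs and comments are accepted.
        line_bytes.decode(declared_encoding)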
