Commit 3ab168a

Include the decoding error position for default encoding in SyntaxError.
1 parent 7e9910e commit 3ab168a
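
A rough sketch of the behavior this commit targets, based on the new expectations added to Lib/test/test_exceptions.py below (the expected values 4 and 12 come from the updated check() calls, not from running this snippet): source that is invalid UTF-8 with no coding cookie now raises a SyntaxError carrying the decoding error's line and column.

    # Sketch only: compile() bytes containing a non-UTF-8 byte under the
    # default (UTF-8) encoding; the SyntaxError now reports where decoding
    # failed.
    try:
        compile(b'\n\n\nPython = "\xcf\xb3\xf2\xee\xed" +', '<fragment>', 'exec')
    except SyntaxError as exc:
        print(exc.lineno, exc.offset)   # expected 4 12 per the updated tests
        print(exc.text)                 # should contain the offending line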

6 files changed: 108 additions & 40 deletions

Lib/test/test_exceptions.py

Lines changed: 6 additions & 2 deletions

@@ -224,6 +224,8 @@ def check(self, src, lineno, offset, end_lineno=None, end_offset=None, encoding=
                 if not isinstance(src, str):
                     src = src.decode(encoding, 'replace')
                 line = src.split('\n')[lineno-1]
+                if lineno == 1:
+                    line = line.removeprefix('\ufeff')
                 self.assertIn(line, cm.exception.text)
 
     def test_error_offset_continuation_characters(self):
@@ -239,7 +241,9 @@ def testSyntaxErrorOffset(self):
         check('Python = "\u1e54\xfd\u0163\u0125\xf2\xf1" +', 1, 20)
         check(b'# -*- coding: cp1251 -*-\nPython = "\xcf\xb3\xf2\xee\xed" +',
               2, 19, encoding='cp1251')
-        check(b'Python = "\xcf\xb3\xf2\xee\xed" +', 1, 10)
+        check(b'Python = "\xcf\xb3\xf2\xee\xed" +', 1, 12)
+        check(b'\n\n\nPython = "\xcf\xb3\xf2\xee\xed" +', 4, 12)
+        check(b'\xef\xbb\xbfPython = "\xcf\xb3\xf2\xee\xed" +', 1, 12)
         check('x = "a', 1, 5)
         check('lambda x: x = 2', 1, 1)
         check('f{a + b + c}', 1, 2)
@@ -287,7 +291,7 @@ def baz():
         check("pass\npass\npass\n(1+)\npass\npass\npass", 4, 4)
         check("(1+)", 1, 4)
         check("[interesting\nfoo()\n", 1, 1)
-        check(b"\xef\xbb\xbf#coding: utf8\nprint('\xe6\x88\x91')\n", 0, -1)
+        check(b"\xef\xbb\xbf#coding: utf8\nprint('\xe6\x88\x91')\n", 1, 0)
         check("""f'''
             {
             (123_a)

Lib/test/test_source_encoding.py

Lines changed: 57 additions & 21 deletions

@@ -293,6 +293,21 @@ def test_utf8_bom_and_utf8_coding_line(self):
                b'print(ascii("\xc3\xa4"))\n')
         self.check_script_output(src, br"'\xe4'")
 
+    def test_utf8_bom_and_non_utf8_first_coding_line(self):
+        src = (b'\xef\xbb\xbf#coding:iso-8859-15\n'
+               b'raise RuntimeError\n')
+        self.check_script_error(src,
+                                br"encoding problem: iso-8859-15 with BOM",
+                                lineno=1)
+
+    def test_utf8_bom_and_non_utf8_second_coding_line(self):
+        src = (b'\xef\xbb\xbf#first\n'
+               b'#coding:iso-8859-15\n'
+               b'raise RuntimeError\n')
+        self.check_script_error(src,
+                                br"encoding problem: iso-8859-15 with BOM",
+                                lineno=2)
+
     def test_non_utf8_shebang(self):
         src = (b'#!/home/\xa4/bin/python\n'
                b'#coding:iso-8859-15\n'
@@ -308,45 +323,50 @@ def test_utf8_shebang_error(self):
     def test_non_utf8_shebang_error(self):
         src = (b'#!/home/\xa4/bin/python\n'
                b'raise RuntimeError\n')
-        self.check_script_error(src, br"Non-UTF-8 code starting with .* on line 1")
+        self.check_script_error(src, br"Non-UTF-8 code starting with .* on line 1",
+                                lineno=1)
 
     def test_non_utf8_second_line_error(self):
-        src = (b'#\n'
-               b'#\xa4\n'
+        src = (b'#first\n'
+               b'#second\xa4\n'
                b'raise RuntimeError\n')
         self.check_script_error(src,
-                                br"Non-UTF-8 code starting with .* on line 2")
+                                br"Non-UTF-8 code starting with .* on line 2",
+                                lineno=2)
 
     def test_non_utf8_third_line_error(self):
-        src = (b'#\n'
-               b'#\n'
-               b'#\xa4\n'
+        src = (b'#first\n'
+               b'#second\n'
+               b'#third\xa4\n'
                b'raise RuntimeError\n')
         self.check_script_error(src,
-                                br"Non-UTF-8 code starting with .* on line 3")
+                                br"Non-UTF-8 code starting with .* on line 3",
+                                lineno=3)
 
     def test_utf8_bom_non_utf8_third_line_error(self):
-        src = (b'\xef\xbb\xbf#\n'
-               b'#\n'
-               b'#\xa4\n'
+        src = (b'\xef\xbb\xbf#first\n'
+               b'#second\n'
+               b'#third\xa4\n'
                b'raise RuntimeError\n')
         self.check_script_error(src,
                                 br"Non-UTF-8 code starting with .* on line 3|"
-                                br"'utf-8' codec can't decode byte")
+                                br"'utf-8' codec can't decode byte",
+                                lineno=3)
 
     def test_utf_8_non_utf8_third_line_error(self):
         src = (b'#coding: utf-8\n'
-               b'#\n'
-               b'#\xa4\n'
+               b'#second\n'
+               b'#third\xa4\n'
                b'raise RuntimeError\n')
         self.check_script_error(src,
                                 br"Non-UTF-8 code starting with .* on line 3|"
-                                br"'utf-8' codec can't decode byte")
+                                br"'utf-8' codec can't decode byte",
+                                lineno=3)
 
     def test_utf8_non_utf8_third_line_error(self):
         src = (b'#coding: utf8\n'
-               b'#\n'
-               b'#\xa4\n'
+               b'#second\n'
+               b'#third\xa4\n'
                b'raise RuntimeError\n')
         self.check_script_error(src,
                                 br"'utf-8' codec can't decode byte|"
@@ -461,9 +481,17 @@ def check_script_output(self, src, expected):
         out = stdout.getvalue().encode('latin1')
         self.assertEqual(out.rstrip(), expected)
 
-    def check_script_error(self, src, expected):
-        with self.assertRaisesRegex(SyntaxError, expected.decode()) as cm:
+    def check_script_error(self, src, expected, lineno=...):
+        with self.assertRaises(SyntaxError) as cm:
             exec(src)
+        exc = cm.exception
+        self.assertRegex(str(exc), expected.decode())
+        if lineno is not ...:
+            self.assertEqual(exc.lineno, lineno)
+            line = src.splitlines()[lineno-1].decode(errors='replace')
+            if lineno == 1:
+                line = line.removeprefix('\ufeff')
+            self.assertEqual(line, exc.text)
 
 
 class FileSourceEncodingTest(AbstractSourceEncodingTest, unittest.TestCase):
@@ -476,13 +504,21 @@ def check_script_output(self, src, expected):
             res = script_helper.assert_python_ok(fn)
         self.assertEqual(res.out.rstrip(), expected)
 
-    def check_script_error(self, src, expected):
+    def check_script_error(self, src, expected, lineno=...):
         with tempfile.TemporaryDirectory() as tmpd:
             fn = os.path.join(tmpd, 'test.py')
             with open(fn, 'wb') as fp:
                 fp.write(src)
             res = script_helper.assert_python_failure(fn)
-        self.assertRegex(res.err.rstrip().splitlines()[-1], b'SyntaxError.*?' + expected)
+        err = res.err.rstrip()
+        self.assertRegex(err.splitlines()[-1], b'SyntaxError.*?' + expected)
+        if lineno is not ...:
+            self.assertIn(f', line {lineno}\n'.encode(), err)
+            line = src.splitlines()[lineno-1].decode(errors='replace')
+            if lineno == 1:
+                line = line.removeprefix('\ufeff')
+            self.assertIn(line.encode(), err)
+
 
 
 if __name__ == "__main__":

Lines changed: 3 additions & 1 deletion

@@ -1,3 +1,5 @@
 Support non-UTF-8 shebang and comments in Python source files if non-UTF-8
 encoding is specified. Detect decoding error in comments for default (UTF-8)
-encoding.
+encoding. Show the line and position of decoding error for default encoding
+in a traceback. Show the line containing the coding cookie when it conflicts
+with the BOM in a traceback.
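
A minimal sketch of the last NEWS item, mirroring the new test_utf8_bom_and_non_utf8_first_coding_line test above (the message text and line number are taken from that test, not verified independently): when a UTF-8 BOM conflicts with a non-UTF-8 coding cookie, the SyntaxError now points at the line holding the cookie.

    # Sketch: BOM says UTF-8, cookie says iso-8859-15 -> SyntaxError on line 1.
    src = b'\xef\xbb\xbf#coding:iso-8859-15\nraise RuntimeError\n'
    try:
        exec(src)
    except SyntaxError as exc:
        print(exc.msg)     # encoding problem: iso-8859-15 with BOM
        print(exc.lineno)  # 1 (the line containing the coding cookie)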

Parser/pegen_errors.c

Lines changed: 8 additions & 0 deletions

@@ -2,6 +2,7 @@
 #include <errcode.h>
 
 #include "pycore_pyerrors.h"       // _PyErr_ProgramDecodedTextObject()
+#include "pycore_runtime.h"        // _Py_ID()
 #include "lexer/state.h"
 #include "lexer/lexer.h"
 #include "pegen.h"
@@ -23,6 +24,13 @@ _PyPegen_raise_tokenizer_init_error(PyObject *filename)
     PyObject *value;
     PyObject *tback;
     PyErr_Fetch(&type, &value, &tback);
+    if (PyErr_GivenExceptionMatches(value, PyExc_SyntaxError)) {
+        if (PyObject_SetAttr(value, &_Py_ID(filename), filename)) {
+            goto error;
+        }
+        PyErr_Restore(type, value, tback);
+        return;
+    }
     errstr = PyObject_Str(value);
     if (!errstr) {
         goto error;

Parser/tokenizer/helpers.c

Lines changed: 31 additions & 16 deletions

@@ -47,8 +47,10 @@ _syntaxerror_range(struct tok_state *tok, const char *format,
         goto error;
     }
 
-    args = Py_BuildValue("(O(OiiNii))", errmsg, tok->filename, tok->lineno,
-                         col_offset, errtext, tok->lineno, end_col_offset);
+    args = Py_BuildValue("(O(OiiNii))", errmsg,
+                         tok->filename ? tok->filename : Py_None,
+                         tok->lineno, col_offset, errtext,
+                         tok->lineno, end_col_offset);
     if (args) {
         PyErr_SetObject(PyExc_SyntaxError, args);
         Py_DECREF(args);
@@ -422,10 +424,12 @@ _PyTokenizer_check_coding_spec(const char* line, Py_ssize_t size, struct tok_sta
         tok->encoding = cs;
     } else {                /* then, compare cs with BOM */
         if (strcmp(tok->encoding, cs) != 0) {
-            _PyTokenizer_error_ret(tok);
-            PyErr_Format(PyExc_SyntaxError,
-                         "encoding problem: %s with BOM", cs);
+            tok->line_start = line;
+            tok->cur = (char *)line;
+            _PyTokenizer_syntaxerror_known_range(tok, 0, size,
+                    "encoding problem: %s with BOM", cs);
             PyMem_Free(cs);
+            _PyTokenizer_error_ret(tok);
             return 0;
         }
         PyMem_Free(cs);
@@ -498,25 +502,36 @@ valid_utf8(const unsigned char* s)
 int
 _PyTokenizer_ensure_utf8(const char *line, struct tok_state *tok, int lineno)
 {
-    int badchar = 0;
-    const unsigned char *c;
+    const char *badchar = NULL;
+    const char *c;
     int length;
-    for (c = (const unsigned char *)line; *c; c += length) {
-        if (!(length = valid_utf8(c))) {
-            badchar = *c;
+    int col_offset = 0;
+    const char *line_start = line;
+    for (c = line; *c; c += length) {
+        if (!(length = valid_utf8((const unsigned char *)c))) {
+            badchar = c;
             break;
         }
+        col_offset++;
         if (*c == '\n') {
             lineno++;
+            col_offset = 0;
+            line_start = c + 1;
         }
     }
     if (badchar) {
-        PyErr_Format(PyExc_SyntaxError,
-                     "Non-UTF-8 code starting with '\\x%.2x' "
-                     "in file %V on line %i, "
-                     "but no encoding declared; "
-                     "see https://peps.python.org/pep-0263/ for details",
-                     badchar, tok->filename, "<string>", lineno);
+        tok->lineno = lineno;
+        tok->line_start = line_start;
+        tok->cur = (char *)badchar;
+        _PyTokenizer_syntaxerror_known_range(tok,
+                col_offset + 1, col_offset + 1,
+                "Non-UTF-8 code starting with '\\x%.2x'"
+                "%s%V on line %i, "
+                "but no encoding declared; "
+                "see https://peps.python.org/pep-0263/ for details",
+                (unsigned char)*badchar,
+                tok->filename ? " in file " : "", tok->filename, "",
+                lineno);
         return 0;
     }
     return 1;

Parser/tokenizer/string_tokenizer.c

Lines changed: 3 additions & 0 deletions

@@ -86,15 +86,18 @@ decode_str(const char *input, int single, struct tok_state *tok, int preserve_cr
     /* need to check line 1 and 2 separately since check_coding_spec
        assumes a single line as input */
     if (newl[0]) {
+        tok->lineno = 1;
         if (!_PyTokenizer_check_coding_spec(str, newl[0] - str, tok, buf_setreadl)) {
             return NULL;
         }
         if (tok->enc == NULL && tok->decoding_state != STATE_NORMAL && newl[1]) {
+            tok->lineno = 2;
             if (!_PyTokenizer_check_coding_spec(newl[0]+1, newl[1] - newl[0],
                                                 tok, buf_setreadl))
                 return NULL;
         }
     }
+    tok->lineno = 0;
     if (tok->enc != NULL) {
         assert(utf8 == NULL);
         utf8 = _PyTokenizer_translate_into_utf8(str, tok->enc);
