Commit 3ab168a

Include the decoding error position for default encoding in SyntaxError.
1 parent 7e9910e commit 3ab168a
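
A rough sketch of the behavior this commit targets, based on the new expectations added to Lib/test/test_exceptions.py below (the expected values 4 and 12 come from the updated check() calls, not from running this snippet): source that is invalid UTF-8 with no coding cookie now raises a SyntaxError carrying the decoding error's line and column.

    # Sketch only: compile() bytes containing a non-UTF-8 byte under the
    # default (UTF-8) encoding; the SyntaxError now reports where decoding
    # failed.
    try:
        compile(b'\n\n\nPython = "\xcf\xb3\xf2\xee\xed" +', '<fragment>', 'exec')
    except SyntaxError as exc:
        print(exc.lineno, exc.offset)   # expected 4 12 per the updated tests
        print(exc.text)                 # should contain the offending line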

6 files changed: 108 additions & 40 deletions

Lib/test/test_exceptions.py

Lines changed: 6 additions & 2 deletions

@@ -224,6 +224,8 @@ def check(self, src, lineno, offset, end_lineno=None, end_offset=None, encoding=
                 if not isinstance(src, str):
                     src = src.decode(encoding, 'replace')
                 line = src.split('\n')[lineno-1]
+                if lineno == 1:
+                    line = line.removeprefix('\ufeff')
                 self.assertIn(line, cm.exception.text)
 
     def test_error_offset_continuation_characters(self):
@@ -239,7 +241,9 @@ def testSyntaxErrorOffset(self):
         check('Python = "\u1e54\xfd\u0163\u0125\xf2\xf1" +', 1, 20)
         check(b'# -*- coding: cp1251 -*-\nPython = "\xcf\xb3\xf2\xee\xed" +',
               2, 19, encoding='cp1251')
-        check(b'Python = "\xcf\xb3\xf2\xee\xed" +', 1, 10)
+        check(b'Python = "\xcf\xb3\xf2\xee\xed" +', 1, 12)
+        check(b'\n\n\nPython = "\xcf\xb3\xf2\xee\xed" +', 4, 12)
+        check(b'\xef\xbb\xbfPython = "\xcf\xb3\xf2\xee\xed" +', 1, 12)
         check('x = "a', 1, 5)
         check('lambda x: x = 2', 1, 1)
         check('f{a + b + c}', 1, 2)
@@ -287,7 +291,7 @@ def baz():
         check("pass\npass\npass\n(1+)\npass\npass\npass", 4, 4)
         check("(1+)", 1, 4)
         check("[interesting\nfoo()\n", 1, 1)
-        check(b"\xef\xbb\xbf#coding: utf8\nprint('\xe6\x88\x91')\n", 0, -1)
+        check(b"\xef\xbb\xbf#coding: utf8\nprint('\xe6\x88\x91')\n", 1, 0)
         check("""f'''
             {
             (123_a)

Lib/test/test_source_encoding.py

Lines changed: 57 additions & 21 deletions

@@ -293,6 +293,21 @@ def test_utf8_bom_and_utf8_coding_line(self):
                b'print(ascii("\xc3\xa4"))\n')
         self.check_script_output(src, br"'\xe4'")
 
+    def test_utf8_bom_and_non_utf8_first_coding_line(self):
+        src = (b'\xef\xbb\xbf#coding:iso-8859-15\n'
+               b'raise RuntimeError\n')
+        self.check_script_error(src,
+                                br"encoding problem: iso-8859-15 with BOM",
+                                lineno=1)
+
+    def test_utf8_bom_and_non_utf8_second_coding_line(self):
+        src = (b'\xef\xbb\xbf#first\n'
+               b'#coding:iso-8859-15\n'
+               b'raise RuntimeError\n')
+        self.check_script_error(src,
+                                br"encoding problem: iso-8859-15 with BOM",
+                                lineno=2)
+
     def test_non_utf8_shebang(self):
         src = (b'#!/home/\xa4/bin/python\n'
                b'#coding:iso-8859-15\n'
@@ -308,45 +323,50 @@ def test_utf8_shebang_error(self):
     def test_non_utf8_shebang_error(self):
         src = (b'#!/home/\xa4/bin/python\n'
                b'raise RuntimeError\n')
-        self.check_script_error(src, br"Non-UTF-8 code starting with .* on line 1")
+        self.check_script_error(src, br"Non-UTF-8 code starting with .* on line 1",
+                                lineno=1)
 
     def test_non_utf8_second_line_error(self):
-        src = (b'#\n'
-               b'#\xa4\n'
+        src = (b'#first\n'
+               b'#second\xa4\n'
                b'raise RuntimeError\n')
         self.check_script_error(src,
-                                br"Non-UTF-8 code starting with .* on line 2")
+                                br"Non-UTF-8 code starting with .* on line 2",
+                                lineno=2)
 
     def test_non_utf8_third_line_error(self):
-        src = (b'#\n'
-               b'#\n'
-               b'#\xa4\n'
+        src = (b'#first\n'
+               b'#second\n'
+               b'#third\xa4\n'
                b'raise RuntimeError\n')
         self.check_script_error(src,
-                                br"Non-UTF-8 code starting with .* on line 3")
+                                br"Non-UTF-8 code starting with .* on line 3",
+                                lineno=3)
 
     def test_utf8_bom_non_utf8_third_line_error(self):
-        src = (b'\xef\xbb\xbf#\n'
-               b'#\n'
-               b'#\xa4\n'
+        src = (b'\xef\xbb\xbf#first\n'
+               b'#second\n'
+               b'#third\xa4\n'
                b'raise RuntimeError\n')
         self.check_script_error(src,
                                 br"Non-UTF-8 code starting with .* on line 3|"
-                                br"'utf-8' codec can't decode byte")
+                                br"'utf-8' codec can't decode byte",
+                                lineno=3)
 
     def test_utf_8_non_utf8_third_line_error(self):
         src = (b'#coding: utf-8\n'
-               b'#\n'
-               b'#\xa4\n'
+               b'#second\n'
+               b'#third\xa4\n'
                b'raise RuntimeError\n')
         self.check_script_error(src,
                                 br"Non-UTF-8 code starting with .* on line 3|"
-                                br"'utf-8' codec can't decode byte")
+                                br"'utf-8' codec can't decode byte",
+                                lineno=3)
 
     def test_utf8_non_utf8_third_line_error(self):
         src = (b'#coding: utf8\n'
-               b'#\n'
-               b'#\xa4\n'
+               b'#second\n'
+               b'#third\xa4\n'
                b'raise RuntimeError\n')
         self.check_script_error(src,
                                 br"'utf-8' codec can't decode byte|"
@@ -461,9 +481,17 @@ def check_script_output(self, src, expected):
         out = stdout.getvalue().encode('latin1')
         self.assertEqual(out.rstrip(), expected)
 
-    def check_script_error(self, src, expected):
-        with self.assertRaisesRegex(SyntaxError, expected.decode()) as cm:
+    def check_script_error(self, src, expected, lineno=...):
+        with self.assertRaises(SyntaxError) as cm:
             exec(src)
+        exc = cm.exception
+        self.assertRegex(str(exc), expected.decode())
+        if lineno is not ...:
+            self.assertEqual(exc.lineno, lineno)
+            line = src.splitlines()[lineno-1].decode(errors='replace')
+            if lineno == 1:
+                line = line.removeprefix('\ufeff')
+            self.assertEqual(line, exc.text)
 
 
 class FileSourceEncodingTest(AbstractSourceEncodingTest, unittest.TestCase):
@@ -476,13 +504,21 @@ def check_script_output(self, src, expected):
             res = script_helper.assert_python_ok(fn)
         self.assertEqual(res.out.rstrip(), expected)
 
-    def check_script_error(self, src, expected):
+    def check_script_error(self, src, expected, lineno=...):
         with tempfile.TemporaryDirectory() as tmpd:
             fn = os.path.join(tmpd, 'test.py')
             with open(fn, 'wb') as fp:
                 fp.write(src)
             res = script_helper.assert_python_failure(fn)
-        self.assertRegex(res.err.rstrip().splitlines()[-1], b'SyntaxError.*?' + expected)
+        err = res.err.rstrip()
+        self.assertRegex(err.splitlines()[-1], b'SyntaxError.*?' + expected)
+        if lineno is not ...:
+            self.assertIn(f', line {lineno}\n'.encode(), err)
+            line = src.splitlines()[lineno-1].decode(errors='replace')
+            if lineno == 1:
+                line = line.removeprefix('\ufeff')
+            self.assertIn(line.encode(), err)
+
 
 
 if __name__ == "__main__":

Lines changed: 3 additions & 1 deletion

@@ -1,3 +1,5 @@
 Support non-UTF-8 shebang and comments in Python source files if non-UTF-8
 encoding is specified. Detect decoding error in comments for default (UTF-8)
-encoding.
+encoding. Show the line and position of decoding error for default encoding
+in a traceback. Show the line containing the coding cookie when it conflicts
+with the BOM in a traceback.
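
A minimal sketch of the last NEWS item, mirroring the new test_utf8_bom_and_non_utf8_first_coding_line test above (the message text and line number are taken from that test, not verified independently): when a UTF-8 BOM conflicts with a non-UTF-8 coding cookie, the SyntaxError now points at the line holding the cookie.

    # Sketch: BOM says UTF-8, cookie says iso-8859-15 -> SyntaxError on line 1.
    src = b'\xef\xbb\xbf#coding:iso-8859-15\nraise RuntimeError\n'
    try:
        exec(src)
    except SyntaxError as exc:
        print(exc.msg)     # encoding problem: iso-8859-15 with BOM
        print(exc.lineno)  # 1 (the line containing the coding cookie)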

Parser/pegen_errors.c

Lines changed: 8 additions & 0 deletions

@@ -2,6 +2,7 @@
 #include <errcode.h>
 
 #include "pycore_pyerrors.h"       // _PyErr_ProgramDecodedTextObject()
+#include "pycore_runtime.h"        // _Py_ID()
 #include "lexer/state.h"
 #include "lexer/lexer.h"
 #include "pegen.h"
@@ -23,6 +24,13 @@ _PyPegen_raise_tokenizer_init_error(PyObject *filename)
     PyObject *value;
     PyObject *tback;
     PyErr_Fetch(&type, &value, &tback);
+    if (PyErr_GivenExceptionMatches(value, PyExc_SyntaxError)) {
+        if (PyObject_SetAttr(value, &_Py_ID(filename), filename)) {
+            goto error;
+        }
+        PyErr_Restore(type, value, tback);
+        return;
+    }
     errstr = PyObject_Str(value);
     if (!errstr) {
         goto error;

Parser/tokenizer/helpers.c

Lines changed: 31 additions & 16 deletions

@@ -47,8 +47,10 @@ _syntaxerror_range(struct tok_state *tok, const char *format,
         goto error;
     }
 
-    args = Py_BuildValue("(O(OiiNii))", errmsg, tok->filename, tok->lineno,
-                         col_offset, errtext, tok->lineno, end_col_offset);
+    args = Py_BuildValue("(O(OiiNii))", errmsg,
+                         tok->filename ? tok->filename : Py_None,
+                         tok->lineno, col_offset, errtext,
+                         tok->lineno, end_col_offset);
     if (args) {
         PyErr_SetObject(PyExc_SyntaxError, args);
         Py_DECREF(args);
@@ -422,10 +424,12 @@ _PyTokenizer_check_coding_spec(const char* line, Py_ssize_t size, struct tok_sta
         tok->encoding = cs;
     } else {                /* then, compare cs with BOM */
         if (strcmp(tok->encoding, cs) != 0) {
-            _PyTokenizer_error_ret(tok);
-            PyErr_Format(PyExc_SyntaxError,
-                         "encoding problem: %s with BOM", cs);
+            tok->line_start = line;
+            tok->cur = (char *)line;
+            _PyTokenizer_syntaxerror_known_range(tok, 0, size,
+                    "encoding problem: %s with BOM", cs);
             PyMem_Free(cs);
+            _PyTokenizer_error_ret(tok);
             return 0;
         }
         PyMem_Free(cs);
@@ -498,25 +502,36 @@ valid_utf8(const unsigned char* s)
 int
 _PyTokenizer_ensure_utf8(const char *line, struct tok_state *tok, int lineno)
 {
-    int badchar = 0;
-    const unsigned char *c;
+    const char *badchar = NULL;
+    const char *c;
     int length;
-    for (c = (const unsigned char *)line; *c; c += length) {
-        if (!(length = valid_utf8(c))) {
-            badchar = *c;
+    int col_offset = 0;
+    const char *line_start = line;
+    for (c = line; *c; c += length) {
+        if (!(length = valid_utf8((const unsigned char *)c))) {
+            badchar = c;
             break;
         }
+        col_offset++;
         if (*c == '\n') {
             lineno++;
+            col_offset = 0;
+            line_start = c + 1;
         }
     }
     if (badchar) {
-        PyErr_Format(PyExc_SyntaxError,
-                     "Non-UTF-8 code starting with '\\x%.2x' "
-                     "in file %V on line %i, "
-                     "but no encoding declared; "
-                     "see https://peps.python.org/pep-0263/ for details",
-                     badchar, tok->filename, "<string>", lineno);
+        tok->lineno = lineno;
+        tok->line_start = line_start;
+        tok->cur = (char *)badchar;
+        _PyTokenizer_syntaxerror_known_range(tok,
+                col_offset + 1, col_offset + 1,
+                "Non-UTF-8 code starting with '\\x%.2x'"
+                "%s%V on line %i, "
+                "but no encoding declared; "
+                "see https://peps.python.org/pep-0263/ for details",
+                (unsigned char)*badchar,
+                tok->filename ? " in file " : "", tok->filename, "",
+                lineno);
         return 0;
     }
     return 1;

Parser/tokenizer/string_tokenizer.c

Lines changed: 3 additions & 0 deletions

@@ -86,15 +86,18 @@ decode_str(const char *input, int single, struct tok_state *tok, int preserve_cr
     /* need to check line 1 and 2 separately since check_coding_spec
        assumes a single line as input */
     if (newl[0]) {
+        tok->lineno = 1;
         if (!_PyTokenizer_check_coding_spec(str, newl[0] - str, tok, buf_setreadl)) {
             return NULL;
         }
         if (tok->enc == NULL && tok->decoding_state != STATE_NORMAL && newl[1]) {
+            tok->lineno = 2;
             if (!_PyTokenizer_check_coding_spec(newl[0]+1, newl[1] - newl[0],
                                                 tok, buf_setreadl))
                 return NULL;
         }
     }
+    tok->lineno = 0;
     if (tok->enc != NULL) {
         assert(utf8 == NULL);
         utf8 = _PyTokenizer_translate_into_utf8(str, tok->enc);
