Skip to content

Commit 7e9910e

Browse files
gh-63161: Fix PEP 263 support
* Support a non-UTF-8 shebang line and comments if a non-UTF-8 encoding is specified. * Detect decoding errors in comments for the default (UTF-8) encoding.
1 parent 0158890 commit 7e9910e

File tree

7 files changed

+113
-22
lines changed

7 files changed

+113
-22
lines changed

Lib/test/test_source_encoding.py

Lines changed: 62 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -267,6 +267,17 @@ def test_second_non_utf8_coding_line(self):
267267
b'print(ascii("\xc3\xa4"))\n')
268268
self.check_script_output(src, br"'\xc3\u20ac'")
269269

270+
def test_first_utf8_coding_line_error(self):
271+
src = (b'#coding:ascii \xc3\xa4\n'
272+
b'raise RuntimeError\n')
273+
self.check_script_error(src, br"'ascii' codec can't decode byte")
274+
275+
def test_second_utf8_coding_line_error(self):
276+
src = (b'#!/usr/bin/python\n'
277+
b'#coding:ascii \xc3\xa4\n'
278+
b'raise RuntimeError\n')
279+
self.check_script_error(src, br"'ascii' codec can't decode byte")
280+
270281
def test_utf8_bom(self):
271282
src = (b'\xef\xbb\xbfprint(ascii("\xc3\xa4"))\n')
272283
self.check_script_output(src, br"'\xe4'")
@@ -282,7 +293,57 @@ def test_utf8_bom_and_utf8_coding_line(self):
282293
b'print(ascii("\xc3\xa4"))\n')
283294
self.check_script_output(src, br"'\xe4'")
284295

285-
def test_utf8_non_utf8_comment_line_error(self):
296+
def test_non_utf8_shebang(self):
297+
src = (b'#!/home/\xa4/bin/python\n'
298+
b'#coding:iso-8859-15\n'
299+
b'print(ascii("\xc3\xa4"))\n')
300+
self.check_script_output(src, br"'\xc3\u20ac'")
301+
302+
def test_utf8_shebang_error(self):
303+
src = (b'#!/home/\xc3\xa4/bin/python\n'
304+
b'#coding:ascii\n'
305+
b'raise RuntimeError\n')
306+
self.check_script_error(src, br"'ascii' codec can't decode byte")
307+
308+
def test_non_utf8_shebang_error(self):
309+
src = (b'#!/home/\xa4/bin/python\n'
310+
b'raise RuntimeError\n')
311+
self.check_script_error(src, br"Non-UTF-8 code starting with .* on line 1")
312+
313+
def test_non_utf8_second_line_error(self):
314+
src = (b'#\n'
315+
b'#\xa4\n'
316+
b'raise RuntimeError\n')
317+
self.check_script_error(src,
318+
br"Non-UTF-8 code starting with .* on line 2")
319+
320+
def test_non_utf8_third_line_error(self):
321+
src = (b'#\n'
322+
b'#\n'
323+
b'#\xa4\n'
324+
b'raise RuntimeError\n')
325+
self.check_script_error(src,
326+
br"Non-UTF-8 code starting with .* on line 3")
327+
328+
def test_utf8_bom_non_utf8_third_line_error(self):
329+
src = (b'\xef\xbb\xbf#\n'
330+
b'#\n'
331+
b'#\xa4\n'
332+
b'raise RuntimeError\n')
333+
self.check_script_error(src,
334+
br"Non-UTF-8 code starting with .* on line 3|"
335+
br"'utf-8' codec can't decode byte")
336+
337+
def test_utf_8_non_utf8_third_line_error(self):
338+
src = (b'#coding: utf-8\n'
339+
b'#\n'
340+
b'#\xa4\n'
341+
b'raise RuntimeError\n')
342+
self.check_script_error(src,
343+
br"Non-UTF-8 code starting with .* on line 3|"
344+
br"'utf-8' codec can't decode byte")
345+
346+
def test_utf8_non_utf8_third_line_error(self):
286347
src = (b'#coding: utf8\n'
287348
b'#\n'
288349
b'#\xa4\n'
Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
Support non-UTF-8 shebang and comments in Python source files if non-UTF-8
2+
encoding is specified. Detect decoding error in comments for default (UTF-8)
3+
encoding.

Parser/tokenizer/file_tokenizer.c

Lines changed: 35 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -282,10 +282,8 @@ tok_underflow_interactive(struct tok_state *tok) {
282282
}
283283

284284
static int
285-
tok_underflow_file(struct tok_state *tok) {
286-
if (tok->start == NULL && !INSIDE_FSTRING(tok)) {
287-
tok->cur = tok->inp = tok->buf;
288-
}
285+
tok_underflow_file(struct tok_state *tok)
286+
{
289287
if (tok->decoding_state == STATE_INIT) {
290288
/* We have not yet determined the encoding.
291289
If an encoding is found, use the file-pointer
@@ -296,8 +294,16 @@ tok_underflow_file(struct tok_state *tok) {
296294
}
297295
assert(tok->decoding_state != STATE_INIT);
298296
}
297+
int raw = tok->decoding_readline == NULL;
298+
if (raw && tok->decoding_state != STATE_NORMAL) {
299+
/* Keep the first line in the buffer to validate it later if
300+
* the encoding has not yet been determined. */
301+
}
302+
else if (tok->start == NULL && !INSIDE_FSTRING(tok)) {
303+
tok->cur = tok->inp = tok->buf;
304+
}
299305
/* Read until '\n' or EOF */
300-
if (tok->decoding_readline != NULL) {
306+
if (!raw) {
301307
/* We already have a codec associated with this input. */
302308
if (!tok_readline_recode(tok)) {
303309
return 0;
@@ -328,20 +334,35 @@ tok_underflow_file(struct tok_state *tok) {
328334

329335
ADVANCE_LINENO();
330336
if (tok->decoding_state != STATE_NORMAL) {
331-
if (tok->lineno > 2) {
332-
tok->decoding_state = STATE_NORMAL;
333-
}
334-
else if (!_PyTokenizer_check_coding_spec(tok->cur, strlen(tok->cur),
337+
if (!_PyTokenizer_check_coding_spec(tok->cur, strlen(tok->cur),
335338
tok, fp_setreadl))
336339
{
337340
return 0;
338341
}
342+
if (tok->lineno >= 2) {
343+
tok->decoding_state = STATE_NORMAL;
344+
}
339345
}
340-
/* The default encoding is UTF-8, so make sure we don't have any
341-
non-UTF-8 sequences in it. */
342-
if (!tok->encoding && !_PyTokenizer_ensure_utf8(tok->cur, tok)) {
343-
_PyTokenizer_error_ret(tok);
344-
return 0;
346+
if (raw && tok->decoding_state == STATE_NORMAL) {
347+
const char *line = tok->lineno <= 2 ? tok->buf : tok->cur;
348+
int lineno = tok->lineno <= 2 ? 1 : tok->lineno;
349+
if (!tok->encoding) {
350+
/* The default encoding is UTF-8, so make sure we don't have any
351+
non-UTF-8 sequences in it. */
352+
if (!_PyTokenizer_ensure_utf8(line, tok, lineno)) {
353+
_PyTokenizer_error_ret(tok);
354+
return 0;
355+
}
356+
}
357+
else {
358+
PyObject *tmp = PyUnicode_Decode(line, strlen(line),
359+
tok->encoding, NULL);
360+
if (tmp == NULL) {
361+
_PyTokenizer_error_ret(tok);
362+
return 0;
363+
}
364+
Py_DECREF(tmp);
365+
}
345366
}
346367
assert(tok->done == E_OK);
347368
return tok->done == E_OK;

Parser/tokenizer/helpers.c

Lines changed: 8 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -496,24 +496,27 @@ valid_utf8(const unsigned char* s)
496496
}
497497

498498
int
499-
_PyTokenizer_ensure_utf8(char *line, struct tok_state *tok)
499+
_PyTokenizer_ensure_utf8(const char *line, struct tok_state *tok, int lineno)
500500
{
501501
int badchar = 0;
502-
unsigned char *c;
502+
const unsigned char *c;
503503
int length;
504-
for (c = (unsigned char *)line; *c; c += length) {
504+
for (c = (const unsigned char *)line; *c; c += length) {
505505
if (!(length = valid_utf8(c))) {
506506
badchar = *c;
507507
break;
508508
}
509+
if (*c == '\n') {
510+
lineno++;
511+
}
509512
}
510513
if (badchar) {
511514
PyErr_Format(PyExc_SyntaxError,
512515
"Non-UTF-8 code starting with '\\x%.2x' "
513-
"in file %U on line %i, "
516+
"in file %V on line %i, "
514517
"but no encoding declared; "
515518
"see https://peps.python.org/pep-0263/ for details",
516-
badchar, tok->filename, tok->lineno);
519+
badchar, tok->filename, "<string>", lineno);
517520
return 0;
518521
}
519522
return 1;

Parser/tokenizer/helpers.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -26,7 +26,7 @@ int _PyTokenizer_check_bom(int get_char(struct tok_state *),
2626
struct tok_state *tok);
2727
int _PyTokenizer_check_coding_spec(const char* line, Py_ssize_t size, struct tok_state *tok,
2828
int set_readline(struct tok_state *, const char *));
29-
int _PyTokenizer_ensure_utf8(char *line, struct tok_state *tok);
29+
int _PyTokenizer_ensure_utf8(const char *line, struct tok_state *tok, int lineno);
3030

3131
#ifdef Py_DEBUG
3232
void _PyTokenizer_print_escape(FILE *f, const char *s, Py_ssize_t size);

Parser/tokenizer/readline_tokenizer.c

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -97,7 +97,7 @@ tok_underflow_readline(struct tok_state* tok) {
9797
ADVANCE_LINENO();
9898
/* The default encoding is UTF-8, so make sure we don't have any
9999
non-UTF-8 sequences in it. */
100-
if (!tok->encoding && !_PyTokenizer_ensure_utf8(tok->cur, tok)) {
100+
if (!tok->encoding && !_PyTokenizer_ensure_utf8(tok->cur, tok, tok->lineno)) {
101101
_PyTokenizer_error_ret(tok);
102102
return 0;
103103
}

Parser/tokenizer/string_tokenizer.c

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -102,6 +102,9 @@ decode_str(const char *input, int single, struct tok_state *tok, int preserve_cr
102102
return _PyTokenizer_error_ret(tok);
103103
str = PyBytes_AS_STRING(utf8);
104104
}
105+
else if (!_PyTokenizer_ensure_utf8(str, tok, 1)) {
106+
return _PyTokenizer_error_ret(tok);
107+
}
105108
assert(tok->decoding_buffer == NULL);
106109
tok->decoding_buffer = utf8; /* CAUTION */
107110
return str;

0 commit comments

Comments
 (0)