Skip to content

Commit 7e9910e

Browse files
gh-63161: Fix PEP 263 support
* Support a non-UTF-8 shebang line and comments if a non-UTF-8 encoding is specified. * Detect decoding errors in comments for the default (UTF-8) encoding.
1 parent 0158890 commit 7e9910e

File tree

7 files changed

+113
-22
lines changed

7 files changed

+113
-22
lines changed

Lib/test/test_source_encoding.py

Lines changed: 62 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -267,6 +267,17 @@ def test_second_non_utf8_coding_line(self):
267267
b'print(ascii("\xc3\xa4"))\n')
268268
self.check_script_output(src, br"'\xc3\u20ac'")
269269

270+
def test_first_utf8_coding_line_error(self):
271+
src = (b'#coding:ascii \xc3\xa4\n'
272+
b'raise RuntimeError\n')
273+
self.check_script_error(src, br"'ascii' codec can't decode byte")
274+
275+
def test_second_utf8_coding_line_error(self):
276+
src = (b'#!/usr/bin/python\n'
277+
b'#coding:ascii \xc3\xa4\n'
278+
b'raise RuntimeError\n')
279+
self.check_script_error(src, br"'ascii' codec can't decode byte")
280+
270281
def test_utf8_bom(self):
271282
src = (b'\xef\xbb\xbfprint(ascii("\xc3\xa4"))\n')
272283
self.check_script_output(src, br"'\xe4'")
@@ -282,7 +293,57 @@ def test_utf8_bom_and_utf8_coding_line(self):
282293
b'print(ascii("\xc3\xa4"))\n')
283294
self.check_script_output(src, br"'\xe4'")
284295

285-
def test_utf8_non_utf8_comment_line_error(self):
296+
def test_non_utf8_shebang(self):
297+
src = (b'#!/home/\xa4/bin/python\n'
298+
b'#coding:iso-8859-15\n'
299+
b'print(ascii("\xc3\xa4"))\n')
300+
self.check_script_output(src, br"'\xc3\u20ac'")
301+
302+
def test_utf8_shebang_error(self):
303+
src = (b'#!/home/\xc3\xa4/bin/python\n'
304+
b'#coding:ascii\n'
305+
b'raise RuntimeError\n')
306+
self.check_script_error(src, br"'ascii' codec can't decode byte")
307+
308+
def test_non_utf8_shebang_error(self):
309+
src = (b'#!/home/\xa4/bin/python\n'
310+
b'raise RuntimeError\n')
311+
self.check_script_error(src, br"Non-UTF-8 code starting with .* on line 1")
312+
313+
def test_non_utf8_second_line_error(self):
314+
src = (b'#\n'
315+
b'#\xa4\n'
316+
b'raise RuntimeError\n')
317+
self.check_script_error(src,
318+
br"Non-UTF-8 code starting with .* on line 2")
319+
320+
def test_non_utf8_third_line_error(self):
321+
src = (b'#\n'
322+
b'#\n'
323+
b'#\xa4\n'
324+
b'raise RuntimeError\n')
325+
self.check_script_error(src,
326+
br"Non-UTF-8 code starting with .* on line 3")
327+
328+
def test_utf8_bom_non_utf8_third_line_error(self):
329+
src = (b'\xef\xbb\xbf#\n'
330+
b'#\n'
331+
b'#\xa4\n'
332+
b'raise RuntimeError\n')
333+
self.check_script_error(src,
334+
br"Non-UTF-8 code starting with .* on line 3|"
335+
br"'utf-8' codec can't decode byte")
336+
337+
def test_utf_8_non_utf8_third_line_error(self):
338+
src = (b'#coding: utf-8\n'
339+
b'#\n'
340+
b'#\xa4\n'
341+
b'raise RuntimeError\n')
342+
self.check_script_error(src,
343+
br"Non-UTF-8 code starting with .* on line 3|"
344+
br"'utf-8' codec can't decode byte")
345+
346+
def test_utf8_non_utf8_third_line_error(self):
286347
src = (b'#coding: utf8\n'
287348
b'#\n'
288349
b'#\xa4\n'
Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
Support non-UTF-8 shebang and comments in Python source files if non-UTF-8
2+
encoding is specified. Detect decoding error in comments for default (UTF-8)
3+
encoding.

Parser/tokenizer/file_tokenizer.c

Lines changed: 35 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -282,10 +282,8 @@ tok_underflow_interactive(struct tok_state *tok) {
282282
}
283283

284284
static int
285-
tok_underflow_file(struct tok_state *tok) {
286-
if (tok->start == NULL && !INSIDE_FSTRING(tok)) {
287-
tok->cur = tok->inp = tok->buf;
288-
}
285+
tok_underflow_file(struct tok_state *tok)
286+
{
289287
if (tok->decoding_state == STATE_INIT) {
290288
/* We have not yet determined the encoding.
291289
If an encoding is found, use the file-pointer
@@ -296,8 +294,16 @@ tok_underflow_file(struct tok_state *tok) {
296294
}
297295
assert(tok->decoding_state != STATE_INIT);
298296
}
297+
int raw = tok->decoding_readline == NULL;
298+
if (raw && tok->decoding_state != STATE_NORMAL) {
299+
/* Keep the first line in the buffer to validate it later if
300+
* the encoding has not yet been determined. */
301+
}
302+
else if (tok->start == NULL && !INSIDE_FSTRING(tok)) {
303+
tok->cur = tok->inp = tok->buf;
304+
}
299305
/* Read until '\n' or EOF */
300-
if (tok->decoding_readline != NULL) {
306+
if (!raw) {
301307
/* We already have a codec associated with this input. */
302308
if (!tok_readline_recode(tok)) {
303309
return 0;
@@ -328,20 +334,35 @@ tok_underflow_file(struct tok_state *tok) {
328334

329335
ADVANCE_LINENO();
330336
if (tok->decoding_state != STATE_NORMAL) {
331-
if (tok->lineno > 2) {
332-
tok->decoding_state = STATE_NORMAL;
333-
}
334-
else if (!_PyTokenizer_check_coding_spec(tok->cur, strlen(tok->cur),
337+
if (!_PyTokenizer_check_coding_spec(tok->cur, strlen(tok->cur),
335338
tok, fp_setreadl))
336339
{
337340
return 0;
338341
}
342+
if (tok->lineno >= 2) {
343+
tok->decoding_state = STATE_NORMAL;
344+
}
339345
}
340-
/* The default encoding is UTF-8, so make sure we don't have any
341-
non-UTF-8 sequences in it. */
342-
if (!tok->encoding && !_PyTokenizer_ensure_utf8(tok->cur, tok)) {
343-
_PyTokenizer_error_ret(tok);
344-
return 0;
346+
if (raw && tok->decoding_state == STATE_NORMAL) {
347+
const char *line = tok->lineno <= 2 ? tok->buf : tok->cur;
348+
int lineno = tok->lineno <= 2 ? 1 : tok->lineno;
349+
if (!tok->encoding) {
350+
/* The default encoding is UTF-8, so make sure we don't have any
351+
non-UTF-8 sequences in it. */
352+
if (!_PyTokenizer_ensure_utf8(line, tok, lineno)) {
353+
_PyTokenizer_error_ret(tok);
354+
return 0;
355+
}
356+
}
357+
else {
358+
PyObject *tmp = PyUnicode_Decode(line, strlen(line),
359+
tok->encoding, NULL);
360+
if (tmp == NULL) {
361+
_PyTokenizer_error_ret(tok);
362+
return 0;
363+
}
364+
Py_DECREF(tmp);
365+
}
345366
}
346367
assert(tok->done == E_OK);
347368
return tok->done == E_OK;

Parser/tokenizer/helpers.c

Lines changed: 8 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -496,24 +496,27 @@ valid_utf8(const unsigned char* s)
496496
}
497497

498498
int
499-
_PyTokenizer_ensure_utf8(char *line, struct tok_state *tok)
499+
_PyTokenizer_ensure_utf8(const char *line, struct tok_state *tok, int lineno)
500500
{
501501
int badchar = 0;
502-
unsigned char *c;
502+
const unsigned char *c;
503503
int length;
504-
for (c = (unsigned char *)line; *c; c += length) {
504+
for (c = (const unsigned char *)line; *c; c += length) {
505505
if (!(length = valid_utf8(c))) {
506506
badchar = *c;
507507
break;
508508
}
509+
if (*c == '\n') {
510+
lineno++;
511+
}
509512
}
510513
if (badchar) {
511514
PyErr_Format(PyExc_SyntaxError,
512515
"Non-UTF-8 code starting with '\\x%.2x' "
513-
"in file %U on line %i, "
516+
"in file %V on line %i, "
514517
"but no encoding declared; "
515518
"see https://peps.python.org/pep-0263/ for details",
516-
badchar, tok->filename, tok->lineno);
519+
badchar, tok->filename, "<string>", lineno);
517520
return 0;
518521
}
519522
return 1;

Parser/tokenizer/helpers.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -26,7 +26,7 @@ int _PyTokenizer_check_bom(int get_char(struct tok_state *),
2626
struct tok_state *tok);
2727
int _PyTokenizer_check_coding_spec(const char* line, Py_ssize_t size, struct tok_state *tok,
2828
int set_readline(struct tok_state *, const char *));
29-
int _PyTokenizer_ensure_utf8(char *line, struct tok_state *tok);
29+
int _PyTokenizer_ensure_utf8(const char *line, struct tok_state *tok, int lineno);
3030

3131
#ifdef Py_DEBUG
3232
void _PyTokenizer_print_escape(FILE *f, const char *s, Py_ssize_t size);

Parser/tokenizer/readline_tokenizer.c

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -97,7 +97,7 @@ tok_underflow_readline(struct tok_state* tok) {
9797
ADVANCE_LINENO();
9898
/* The default encoding is UTF-8, so make sure we don't have any
9999
non-UTF-8 sequences in it. */
100-
if (!tok->encoding && !_PyTokenizer_ensure_utf8(tok->cur, tok)) {
100+
if (!tok->encoding && !_PyTokenizer_ensure_utf8(tok->cur, tok, tok->lineno)) {
101101
_PyTokenizer_error_ret(tok);
102102
return 0;
103103
}

Parser/tokenizer/string_tokenizer.c

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -102,6 +102,9 @@ decode_str(const char *input, int single, struct tok_state *tok, int preserve_cr
102102
return _PyTokenizer_error_ret(tok);
103103
str = PyBytes_AS_STRING(utf8);
104104
}
105+
else if (!_PyTokenizer_ensure_utf8(str, tok, 1)) {
106+
return _PyTokenizer_error_ret(tok);
107+
}
105108
assert(tok->decoding_buffer == NULL);
106109
tok->decoding_buffer = utf8; /* CAUTION */
107110
return str;

0 commit comments

Comments
 (0)