
Commit bcffc35

Incomplete tokenizer input mode
1 parent 3dfed23 commit bcffc35

File tree

13 files changed: +85 -45 lines

Include/internal/pycore_global_objects_fini_generated.h

Lines changed: 1 addition & 0 deletions
Generated file; diff not rendered by default.

Include/internal/pycore_global_strings.h

Lines changed: 1 addition & 0 deletions
@@ -500,6 +500,7 @@ struct _Py_global_strings {
         STRUCT_FOR_ID(importlib)
         STRUCT_FOR_ID(in_fd)
         STRUCT_FOR_ID(incoming)
+        STRUCT_FOR_ID(incomplete_input)
         STRUCT_FOR_ID(index)
         STRUCT_FOR_ID(indexgroup)
         STRUCT_FOR_ID(inf)

Include/internal/pycore_runtime_init_generated.h

Lines changed: 1 addition & 0 deletions
Generated file; diff not rendered by default.

Include/internal/pycore_unicodeobject_generated.h

Lines changed: 4 additions & 0 deletions
Generated file; diff not rendered by default.

Lib/_pyrepl/utils.py

Lines changed: 11 additions & 12 deletions
@@ -96,7 +96,7 @@ def gen_colors(buffer: str) -> Iterator[ColorSpan]:
         line_lengths[i] += line_lengths[i-1]

     sio.seek(0)
-    gen = tokenize.generate_tokens(sio.readline)
+    gen = tokenize._generate_tokens_from_c_tokenizer(sio.readline, extra_tokens=True, incomplete_input=True)
     last_emitted: ColorSpan | None = None
     try:
         for color in gen_colors_from_token_stream(gen, line_lengths):
@@ -106,30 +106,29 @@ def gen_colors(buffer: str) -> Iterator[ColorSpan]:
         yield from recover_unterminated_string(
             te, line_lengths, last_emitted, buffer
         )
+    except _IncompleteInputError as te:
+        yield from recover_unterminated_string(
+            te, line_lengths, last_emitted, buffer, incomplete_input=True
+        )


 def recover_unterminated_string(
     exc: tokenize.TokenError,
     line_lengths: list[int],
     last_emitted: ColorSpan | None,
     buffer: str,
+    incomplete_input: bool = False,
 ) -> Iterator[ColorSpan]:
     msg, loc = exc.args
     if loc is None:
         return

-    line_no, column = loc
+    if incomplete_input:
+        _, line_no, column, *_ = loc
+    else:
+        line_no, column = loc

-    if msg.startswith(
-        (
-            "unterminated string literal",
-            "unterminated f-string literal",
-            "unterminated t-string literal",
-            "EOF in multi-line string",
-            "unterminated triple-quoted f-string literal",
-            "unterminated triple-quoted t-string literal",
-        )
-    ):
+    if incomplete_input:
         start = line_lengths[line_no - 1] + column - 1
         end = line_lengths[-1] - 1

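Note: the two unpacking branches above exist because the two exceptions carry differently shaped args. A minimal sketch of the assumed layouts (inferred from this diff and from SyntaxError's standard attributes; the literal values are illustrative only):

# tokenize.TokenError packs the location as a plain (lineno, offset) pair:
msg, loc = ("unterminated string literal (detected at line 1)", (1, 5))
line_no, column = loc

# _IncompleteInputError subclasses SyntaxError, whose second argument here is
# the 6-tuple (filename, lineno, offset, text, end_lineno, end_offset) built
# by Py_BuildValue in Parser/tokenizer/helpers.c below, so the unpacking
# skips the filename and ignores the trailing fields:
msg, loc = ("unterminated string literal (detected at line 1)",
            ("<string>", 1, 5, 'x = "abc', 1, 9))
_, line_no, column, *_ = loc
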
Lib/tokenize.py

Lines changed: 5 additions & 3 deletions
@@ -572,12 +572,12 @@ def _transform_msg(msg):
         return "EOF in multi-line string"
     return msg

-def _generate_tokens_from_c_tokenizer(source, encoding=None, extra_tokens=False):
+def _generate_tokens_from_c_tokenizer(source, encoding=None, extra_tokens=False, incomplete_input=False):
     """Tokenize a source reading Python code as unicode strings using the internal C tokenizer"""
     if encoding is None:
-        it = _tokenize.TokenizerIter(source, extra_tokens=extra_tokens)
+        it = _tokenize.TokenizerIter(source, extra_tokens=extra_tokens, incomplete_input=incomplete_input)
     else:
-        it = _tokenize.TokenizerIter(source, encoding=encoding, extra_tokens=extra_tokens)
+        it = _tokenize.TokenizerIter(source, encoding=encoding, extra_tokens=extra_tokens, incomplete_input=incomplete_input)
     try:
         for info in it:
             yield TokenInfo._make(info)
@@ -586,6 +586,8 @@ def _generate_tokens_from_c_tokenizer(source, encoding=None, extra_tokens=False)
             raise e from None
         msg = _transform_msg(e.msg)
         raise TokenError(msg, (e.lineno, e.offset)) from None
+    except _IncompleteInputError:
+        raise


 if __name__ == "__main__":
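
A minimal usage sketch of the new keyword (assumes an interpreter built with this commit; _IncompleteInputError is the private builtin SyntaxError subclass referenced above):

import io
import tokenize

source = 'x = """abc\n'  # unterminated triple-quoted string
gen = tokenize._generate_tokens_from_c_tokenizer(
    io.StringIO(source).readline, extra_tokens=True, incomplete_input=True
)
try:
    for tok in gen:
        print(tok)
except _IncompleteInputError as exc:
    # With incomplete_input=True the error propagates as _IncompleteInputError
    # instead of being converted into tokenize.TokenError.
    print("incomplete input:", exc.msg)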

Parser/lexer/lexer.c

Lines changed: 5 additions & 5 deletions
@@ -1142,7 +1142,7 @@ tok_get_normal_mode(struct tok_state *tok, tokenizer_mode* current_tok, struct t
            }

            if (quote_size == 3) {
-                _PyTokenizer_syntaxerror(tok, "unterminated triple-quoted string literal"
+                _PyTokenizer_unterminated_syntaxerror(tok, "unterminated triple-quoted string literal"
                                     " (detected at line %d)", start);
                if (c != '\n') {
                    tok->done = E_EOFS;
@@ -1151,14 +1151,14 @@ tok_get_normal_mode(struct tok_state *tok, tokenizer_mode* current_tok, struct t
            }
            else {
                if (has_escaped_quote) {
-                    _PyTokenizer_syntaxerror(
+                    _PyTokenizer_unterminated_syntaxerror(
                        tok,
                        "unterminated string literal (detected at line %d); "
                        "perhaps you escaped the end quote?",
                        start
                    );
                } else {
-                    _PyTokenizer_syntaxerror(
+                    _PyTokenizer_unterminated_syntaxerror(
                        tok, "unterminated string literal (detected at line %d)", start
                    );
                }
@@ -1446,7 +1446,7 @@ tok_get_fstring_mode(struct tok_state *tok, tokenizer_mode* current_tok, struct t
        tok->lineno = the_current_tok->first_line;

        if (current_tok->quote_size == 3) {
-            _PyTokenizer_syntaxerror(tok,
+            _PyTokenizer_unterminated_syntaxerror(tok,
                                "unterminated triple-quoted %c-string literal"
                                " (detected at line %d)",
                                TOK_GET_STRING_PREFIX(tok), start);
@@ -1456,7 +1456,7 @@ tok_get_fstring_mode(struct tok_state *tok, tokenizer_mode* current_tok, struct t
            return MAKE_TOKEN(ERRORTOKEN);
        }
        else {
-            return MAKE_TOKEN(_PyTokenizer_syntaxerror(tok,
+            return MAKE_TOKEN(_PyTokenizer_unterminated_syntaxerror(tok,
                "unterminated %c-string literal (detected at"
                " line %d)", TOK_GET_STRING_PREFIX(tok), start));
        }
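
Only the unterminated-string call sites switch to _PyTokenizer_unterminated_syntaxerror, so other tokenizer failures still surface as ordinary syntax errors. A hypothetical probe of that distinction (the helper name and expected results are assumptions, not part of this commit):

import io
import tokenize

def classify(source):
    gen = tokenize._generate_tokens_from_c_tokenizer(
        io.StringIO(source).readline, extra_tokens=True, incomplete_input=True
    )
    try:
        list(gen)
        return "complete"
    except _IncompleteInputError:      # must precede SyntaxError: it subclasses it
        return "incomplete"
    except (tokenize.TokenError, SyntaxError):
        return "error"

print(classify('s = """still open\n'))  # expected: incomplete
print(classify('n = 1_\n'))             # invalid literal; expected: error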

Parser/lexer/state.c

Lines changed: 1 addition & 0 deletions
@@ -56,6 +56,7 @@ _PyTokenizer_tok_new(void)
     tok->implicit_newline = 0;
     tok->tok_mode_stack[0] = (tokenizer_mode){.kind =TOK_REGULAR_MODE, .quote='\0', .quote_size = 0, .in_debug=0};
     tok->tok_mode_stack_index = 0;
+    tok->incomplete_input = 0;
 #ifdef Py_DEBUG
     tok->debug = _Py_GetConfig()->parser_debug;
 #endif

Parser/lexer/state.h

Lines changed: 1 addition & 0 deletions
@@ -131,6 +131,7 @@ struct tok_state {
     int tok_extra_tokens;
     int comment_newline;
     int implicit_newline;
+    int incomplete_input;
 #ifdef Py_DEBUG
     int debug;
 #endif

Parser/tokenizer/helpers.c

Lines changed: 21 additions & 6 deletions
@@ -1,16 +1,16 @@
 #include "Python.h"
 #include "errcode.h"
 #include "pycore_token.h"
+#include "pycore_pyerrors.h"

 #include "../lexer/state.h"


 /* ############## ERRORS ############## */

 static int
-_syntaxerror_range(struct tok_state *tok, const char *format,
-                   int col_offset, int end_col_offset,
-                   va_list vargs)
+_syntaxerror_range(struct tok_state *tok, int raise_incomplete_input, const char *format,
+                   int col_offset, int end_col_offset, va_list vargs)
 {
     // In release builds, we don't want to overwrite a previous error, but in debug builds we
     // want to fail if we are not doing it so we can fix it.
@@ -50,7 +50,11 @@ _syntaxerror_range(struct tok_state *tok, const char *format,
     args = Py_BuildValue("(O(OiiNii))", errmsg, tok->filename, tok->lineno,
                          col_offset, errtext, tok->lineno, end_col_offset);
     if (args) {
-        PyErr_SetObject(PyExc_SyntaxError, args);
+        if (tok->incomplete_input && raise_incomplete_input) {
+            PyErr_SetObject(PyExc_IncompleteInputError, args);
+        } else {
+            PyErr_SetObject(PyExc_SyntaxError, args);
+        }
         Py_DECREF(args);
     }

@@ -66,7 +70,18 @@ _PyTokenizer_syntaxerror(struct tok_state *tok, const char *format, ...)
     // This errors are cleaned on startup. Todo: Fix it.
     va_list vargs;
     va_start(vargs, format);
-    int ret = _syntaxerror_range(tok, format, -1, -1, vargs);
+    int ret = _syntaxerror_range(tok, 0, format, -1, -1, vargs);
+    va_end(vargs);
+    return ret;
+}
+
+int
+_PyTokenizer_unterminated_syntaxerror(struct tok_state *tok, const char *format, ...)
+{
+    // This errors are cleaned on startup. Todo: Fix it.
+    va_list vargs;
+    va_start(vargs, format);
+    int ret = _syntaxerror_range(tok, 1, format, -1, -1, vargs);
     va_end(vargs);
     return ret;
 }
@@ -78,7 +93,7 @@ _PyTokenizer_syntaxerror_known_range(struct tok_state *tok,
 {
     va_list vargs;
     va_start(vargs, format);
-    int ret = _syntaxerror_range(tok, format, col_offset, end_col_offset, vargs);
+    int ret = _syntaxerror_range(tok, 0, format, col_offset, end_col_offset, vargs);
     va_end(vargs);
     return ret;
 }
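
For reference, PyExc_IncompleteInputError is the C-API name of the builtin _IncompleteInputError, a SyntaxError subclass, so existing "except SyntaxError" handlers keep catching these errors. A one-line sanity check (assumes a CPython recent enough to ship the builtin):

print(issubclass(_IncompleteInputError, SyntaxError))  # True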
