
Commit bcffc35

Incomplete tokenizer input mode
1 parent 3dfed23 commit bcffc35

File tree

13 files changed: +85 -45 lines

Include/internal/pycore_global_objects_fini_generated.h

Lines changed: 1 addition & 0 deletions
Generated file; diff not rendered by default.

Include/internal/pycore_global_strings.h

Lines changed: 1 addition & 0 deletions
@@ -500,6 +500,7 @@ struct _Py_global_strings {
         STRUCT_FOR_ID(importlib)
         STRUCT_FOR_ID(in_fd)
         STRUCT_FOR_ID(incoming)
+        STRUCT_FOR_ID(incomplete_input)
         STRUCT_FOR_ID(index)
         STRUCT_FOR_ID(indexgroup)
         STRUCT_FOR_ID(inf)

Include/internal/pycore_runtime_init_generated.h

Lines changed: 1 addition & 0 deletions
Generated file; diff not rendered by default.

Include/internal/pycore_unicodeobject_generated.h

Lines changed: 4 additions & 0 deletions
Generated file; diff not rendered by default.

Lib/_pyrepl/utils.py

Lines changed: 11 additions & 12 deletions
@@ -96,7 +96,7 @@ def gen_colors(buffer: str) -> Iterator[ColorSpan]:
         line_lengths[i] += line_lengths[i-1]

     sio.seek(0)
-    gen = tokenize.generate_tokens(sio.readline)
+    gen = tokenize._generate_tokens_from_c_tokenizer(sio.readline, extra_tokens=True, incomplete_input=True)
     last_emitted: ColorSpan | None = None
     try:
         for color in gen_colors_from_token_stream(gen, line_lengths):
@@ -106,30 +106,29 @@ def gen_colors(buffer: str) -> Iterator[ColorSpan]:
         yield from recover_unterminated_string(
             te, line_lengths, last_emitted, buffer
         )
+    except _IncompleteInputError as te:
+        yield from recover_unterminated_string(
+            te, line_lengths, last_emitted, buffer, incomplete_input=True
+        )


 def recover_unterminated_string(
     exc: tokenize.TokenError,
     line_lengths: list[int],
     last_emitted: ColorSpan | None,
     buffer: str,
+    incomplete_input: bool = False,
 ) -> Iterator[ColorSpan]:
     msg, loc = exc.args
     if loc is None:
         return

-    line_no, column = loc
+    if incomplete_input:
+        _, line_no, column, *_ = loc
+    else:
+        line_no, column = loc

-    if msg.startswith(
-        (
-            "unterminated string literal",
-            "unterminated f-string literal",
-            "unterminated t-string literal",
-            "EOF in multi-line string",
-            "unterminated triple-quoted f-string literal",
-            "unterminated triple-quoted t-string literal",
-        )
-    ):
+    if incomplete_input:
         start = line_lengths[line_no - 1] + column - 1
         end = line_lengths[-1] - 1

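Note: the two unpacking branches above exist because the two exceptions carry differently shaped args. A minimal sketch of the assumed layouts (inferred from this diff and from SyntaxError's standard attributes; the literal values are illustrative only):

# tokenize.TokenError packs the location as a plain (lineno, offset) pair:
msg, loc = ("unterminated string literal (detected at line 1)", (1, 5))
line_no, column = loc

# _IncompleteInputError subclasses SyntaxError, whose second argument here is
# the 6-tuple (filename, lineno, offset, text, end_lineno, end_offset) built
# by Py_BuildValue in Parser/tokenizer/helpers.c below, so the unpacking
# skips the filename and ignores the trailing fields:
msg, loc = ("unterminated string literal (detected at line 1)",
            ("<string>", 1, 5, 'x = "abc', 1, 9))
_, line_no, column, *_ = loc
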
Lib/tokenize.py

Lines changed: 5 additions & 3 deletions
@@ -572,12 +572,12 @@ def _transform_msg(msg):
         return "EOF in multi-line string"
     return msg

-def _generate_tokens_from_c_tokenizer(source, encoding=None, extra_tokens=False):
+def _generate_tokens_from_c_tokenizer(source, encoding=None, extra_tokens=False, incomplete_input=False):
     """Tokenize a source reading Python code as unicode strings using the internal C tokenizer"""
     if encoding is None:
-        it = _tokenize.TokenizerIter(source, extra_tokens=extra_tokens)
+        it = _tokenize.TokenizerIter(source, extra_tokens=extra_tokens, incomplete_input=incomplete_input)
     else:
-        it = _tokenize.TokenizerIter(source, encoding=encoding, extra_tokens=extra_tokens)
+        it = _tokenize.TokenizerIter(source, encoding=encoding, extra_tokens=extra_tokens, incomplete_input=incomplete_input)
     try:
         for info in it:
             yield TokenInfo._make(info)
@@ -586,6 +586,8 @@ def _generate_tokens_from_c_tokenizer(source, encoding=None, extra_tokens=False)
             raise e from None
         msg = _transform_msg(e.msg)
         raise TokenError(msg, (e.lineno, e.offset)) from None
+    except _IncompleteInputError:
+        raise


 if __name__ == "__main__":
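
A minimal usage sketch of the new keyword (assumes an interpreter built with this commit; _IncompleteInputError is the private builtin SyntaxError subclass referenced above):

import io
import tokenize

source = 'x = """abc\n'  # unterminated triple-quoted string
gen = tokenize._generate_tokens_from_c_tokenizer(
    io.StringIO(source).readline, extra_tokens=True, incomplete_input=True
)
try:
    for tok in gen:
        print(tok)
except _IncompleteInputError as exc:
    # With incomplete_input=True the error propagates as _IncompleteInputError
    # instead of being converted into tokenize.TokenError.
    print("incomplete input:", exc.msg)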

Parser/lexer/lexer.c

Lines changed: 5 additions & 5 deletions
@@ -1142,7 +1142,7 @@ tok_get_normal_mode(struct tok_state *tok, tokenizer_mode* current_tok, struct t
            }

            if (quote_size == 3) {
-                _PyTokenizer_syntaxerror(tok, "unterminated triple-quoted string literal"
+                _PyTokenizer_unterminated_syntaxerror(tok, "unterminated triple-quoted string literal"
                                     " (detected at line %d)", start);
                if (c != '\n') {
                    tok->done = E_EOFS;
@@ -1151,14 +1151,14 @@ tok_get_normal_mode(struct tok_state *tok, tokenizer_mode* current_tok, struct t
            }
            else {
                if (has_escaped_quote) {
-                    _PyTokenizer_syntaxerror(
+                    _PyTokenizer_unterminated_syntaxerror(
                        tok,
                        "unterminated string literal (detected at line %d); "
                        "perhaps you escaped the end quote?",
                        start
                    );
                } else {
-                    _PyTokenizer_syntaxerror(
+                    _PyTokenizer_unterminated_syntaxerror(
                        tok, "unterminated string literal (detected at line %d)", start
                    );
                }
@@ -1446,7 +1446,7 @@ tok_get_fstring_mode(struct tok_state *tok, tokenizer_mode* current_tok, struct t
        tok->lineno = the_current_tok->first_line;

        if (current_tok->quote_size == 3) {
-            _PyTokenizer_syntaxerror(tok,
+            _PyTokenizer_unterminated_syntaxerror(tok,
                                "unterminated triple-quoted %c-string literal"
                                " (detected at line %d)",
                                TOK_GET_STRING_PREFIX(tok), start);
@@ -1456,7 +1456,7 @@ tok_get_fstring_mode(struct tok_state *tok, tokenizer_mode* current_tok, struct t
            return MAKE_TOKEN(ERRORTOKEN);
        }
        else {
-            return MAKE_TOKEN(_PyTokenizer_syntaxerror(tok,
+            return MAKE_TOKEN(_PyTokenizer_unterminated_syntaxerror(tok,
                "unterminated %c-string literal (detected at"
                " line %d)", TOK_GET_STRING_PREFIX(tok), start));
        }
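
Only the unterminated-string call sites switch to _PyTokenizer_unterminated_syntaxerror, so other tokenizer failures still surface as ordinary syntax errors. A hypothetical probe of that distinction (the helper name and expected results are assumptions, not part of this commit):

import io
import tokenize

def classify(source):
    gen = tokenize._generate_tokens_from_c_tokenizer(
        io.StringIO(source).readline, extra_tokens=True, incomplete_input=True
    )
    try:
        list(gen)
        return "complete"
    except _IncompleteInputError:      # must precede SyntaxError: it subclasses it
        return "incomplete"
    except (tokenize.TokenError, SyntaxError):
        return "error"

print(classify('s = """still open\n'))  # expected: incomplete
print(classify('n = 1_\n'))             # invalid literal; expected: error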

Parser/lexer/state.c

Lines changed: 1 addition & 0 deletions
@@ -56,6 +56,7 @@ _PyTokenizer_tok_new(void)
     tok->implicit_newline = 0;
     tok->tok_mode_stack[0] = (tokenizer_mode){.kind =TOK_REGULAR_MODE, .quote='\0', .quote_size = 0, .in_debug=0};
     tok->tok_mode_stack_index = 0;
+    tok->incomplete_input = 0;
 #ifdef Py_DEBUG
     tok->debug = _Py_GetConfig()->parser_debug;
 #endif

Parser/lexer/state.h

Lines changed: 1 addition & 0 deletions
@@ -131,6 +131,7 @@ struct tok_state {
     int tok_extra_tokens;
     int comment_newline;
     int implicit_newline;
+    int incomplete_input;
 #ifdef Py_DEBUG
     int debug;
 #endif

Parser/tokenizer/helpers.c

Lines changed: 21 additions & 6 deletions
@@ -1,16 +1,16 @@
 #include "Python.h"
 #include "errcode.h"
 #include "pycore_token.h"
+#include "pycore_pyerrors.h"

 #include "../lexer/state.h"


 /* ############## ERRORS ############## */

 static int
-_syntaxerror_range(struct tok_state *tok, const char *format,
-                   int col_offset, int end_col_offset,
-                   va_list vargs)
+_syntaxerror_range(struct tok_state *tok, int raise_incomplete_input, const char *format,
+                   int col_offset, int end_col_offset, va_list vargs)
 {
     // In release builds, we don't want to overwrite a previous error, but in debug builds we
     // want to fail if we are not doing it so we can fix it.
@@ -50,7 +50,11 @@ _syntaxerror_range(struct tok_state *tok, const char *format,
     args = Py_BuildValue("(O(OiiNii))", errmsg, tok->filename, tok->lineno,
                          col_offset, errtext, tok->lineno, end_col_offset);
     if (args) {
-        PyErr_SetObject(PyExc_SyntaxError, args);
+        if (tok->incomplete_input && raise_incomplete_input) {
+            PyErr_SetObject(PyExc_IncompleteInputError, args);
+        } else {
+            PyErr_SetObject(PyExc_SyntaxError, args);
+        }
         Py_DECREF(args);
     }

@@ -66,7 +70,18 @@ _PyTokenizer_syntaxerror(struct tok_state *tok, const char *format, ...)
     // This errors are cleaned on startup. Todo: Fix it.
     va_list vargs;
     va_start(vargs, format);
-    int ret = _syntaxerror_range(tok, format, -1, -1, vargs);
+    int ret = _syntaxerror_range(tok, 0, format, -1, -1, vargs);
+    va_end(vargs);
+    return ret;
+}
+
+int
+_PyTokenizer_unterminated_syntaxerror(struct tok_state *tok, const char *format, ...)
+{
+    // This errors are cleaned on startup. Todo: Fix it.
+    va_list vargs;
+    va_start(vargs, format);
+    int ret = _syntaxerror_range(tok, 1, format, -1, -1, vargs);
     va_end(vargs);
     return ret;
 }
@@ -78,7 +93,7 @@ _PyTokenizer_syntaxerror_known_range(struct tok_state *tok,
 {
     va_list vargs;
     va_start(vargs, format);
-    int ret = _syntaxerror_range(tok, format, col_offset, end_col_offset, vargs);
+    int ret = _syntaxerror_range(tok, 0, format, col_offset, end_col_offset, vargs);
     va_end(vargs);
     return ret;
 }
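
For reference, PyExc_IncompleteInputError is the C-API name of the builtin _IncompleteInputError, a SyntaxError subclass, so existing "except SyntaxError" handlers keep catching these errors. A one-line sanity check (assumes a CPython recent enough to ship the builtin):

print(issubclass(_IncompleteInputError, SyntaxError))  # True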
