Skip to content

Commit c8a9482

Browse files
committed
Add TOK_GET_STRING_PREFIX macro and change tstring field to enum
1 parent a5e3bde commit c8a9482

File tree

6 files changed

+35
-25
lines changed

6 files changed

+35
-25
lines changed

Grammar/python.gram

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1225,7 +1225,7 @@ invalid_expression:
12251225
RAISE_SYNTAX_ERROR_KNOWN_LOCATION (a, "expected expression before 'if', but statement is given") }
12261226
| a='lambda' [lambda_params] b=':' &FSTRING_MIDDLE {
12271227
RAISE_SYNTAX_ERROR_KNOWN_RANGE(a, b, "%c-string: lambda expressions are not allowed without parentheses",
1228-
TOK_GET_MODE(p->tok)->tstring ? 't' : 'f') }
1228+
TOK_GET_STRING_PREFIX(p->tok)) }
12291229

12301230
invalid_named_expression(memo):
12311231
| a=expression ':=' expression {

Parser/action_helpers.c

Lines changed: 9 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -966,7 +966,7 @@ _PyPegen_check_fstring_conversion(Parser *p, Token* conv_token, expr_ty conv)
966966
return RAISE_SYNTAX_ERROR_KNOWN_RANGE(
967967
conv_token, conv,
968968
"%c-string: conversion type must come right after the exclamanation mark",
969-
TOK_GET_MODE(p->tok)->tstring ? 't' : 'f'
969+
TOK_GET_STRING_PREFIX(p->tok)
970970
);
971971
}
972972

@@ -975,7 +975,7 @@ _PyPegen_check_fstring_conversion(Parser *p, Token* conv_token, expr_ty conv)
975975
!(first == 's' || first == 'r' || first == 'a')) {
976976
RAISE_SYNTAX_ERROR_KNOWN_LOCATION(conv,
977977
"%c-string: invalid conversion character %R: expected 's', 'r', or 'a'",
978-
TOK_GET_MODE(p->tok)->tstring ? 't' : 'f',
978+
TOK_GET_STRING_PREFIX(p->tok),
979979
conv->v.Name.id);
980980
return NULL;
981981
}
@@ -1295,7 +1295,7 @@ _PyPegen_decode_fstring_part(Parser* p, int is_raw, expr_ty constant, Token* tok
12951295
}
12961296

12971297
static asdl_expr_seq *
1298-
_get_resized_exprs(Parser *p, Token *a, asdl_expr_seq *raw_expressions, Token *b, int tstring)
1298+
_get_resized_exprs(Parser *p, Token *a, asdl_expr_seq *raw_expressions, Token *b, enum string_kind_t string_kind)
12991299
{
13001300
Py_ssize_t n_items = asdl_seq_LEN(raw_expressions);
13011301
Py_ssize_t total_items = n_items;
@@ -1329,8 +1329,9 @@ _get_resized_exprs(Parser *p, Token *a, asdl_expr_seq *raw_expressions, Token *b
13291329
asdl_expr_seq *values = item->v.JoinedStr.values;
13301330
if (asdl_seq_LEN(values) != 2) {
13311331
PyErr_Format(PyExc_SystemError,
1332-
tstring ? "unexpected TemplateStr node without debug data in t-string at line %d"
1333-
: "unexpected JoinedStr node without debug data in f-string at line %d",
1332+
string_kind == TSTRING
1333+
? "unexpected TemplateStr node without debug data in t-string at line %d"
1334+
: "unexpected JoinedStr node without debug data in f-string at line %d",
13341335
item->lineno);
13351336
return NULL;
13361337
}
@@ -1340,7 +1341,7 @@ _get_resized_exprs(Parser *p, Token *a, asdl_expr_seq *raw_expressions, Token *b
13401341
asdl_seq_SET(seq, index++, first);
13411342

13421343
expr_ty second = asdl_seq_GET(values, 1);
1343-
assert((tstring && second->kind == Interpolation_kind) || second->kind == FormattedValue_kind);
1344+
assert((string_kind == TSTRING && second->kind == Interpolation_kind) || second->kind == FormattedValue_kind);
13441345
asdl_seq_SET(seq, index++, second);
13451346

13461347
continue;
@@ -1382,7 +1383,7 @@ _get_resized_exprs(Parser *p, Token *a, asdl_expr_seq *raw_expressions, Token *b
13821383
expr_ty
13831384
_PyPegen_template_str(Parser *p, Token *a, asdl_expr_seq *raw_expressions, Token *b) {
13841385

1385-
asdl_expr_seq *resized_exprs = _get_resized_exprs(p, a, raw_expressions, b, 1);
1386+
asdl_expr_seq *resized_exprs = _get_resized_exprs(p, a, raw_expressions, b, TSTRING);
13861387
return _PyAST_TemplateStr(resized_exprs, a->lineno, a->col_offset,
13871388
b->end_lineno, b->end_col_offset,
13881389
p->arena);
@@ -1391,7 +1392,7 @@ _PyPegen_template_str(Parser *p, Token *a, asdl_expr_seq *raw_expressions, Token
13911392
expr_ty
13921393
_PyPegen_joined_str(Parser *p, Token* a, asdl_expr_seq* raw_expressions, Token*b) {
13931394

1394-
asdl_expr_seq *resized_exprs = _get_resized_exprs(p, a, raw_expressions, b, 0);
1395+
asdl_expr_seq *resized_exprs = _get_resized_exprs(p, a, raw_expressions, b, FSTRING);
13951396
return _PyAST_JoinedStr(resized_exprs, a->lineno, a->col_offset,
13961397
b->end_lineno, b->end_col_offset,
13971398
p->arena);

Parser/lexer/lexer.c

Lines changed: 17 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -38,6 +38,7 @@ static inline tokenizer_mode* TOK_NEXT_MODE(struct tok_state* tok) {
3838
#define TOK_NEXT_MODE(tok) (&(tok->tok_mode_stack[++tok->tok_mode_stack_index]))
3939
#endif
4040

41+
#define TOK_GET_STRING_PREFIX(tok) (TOK_GET_MODE(tok)->string_kind == TSTRING ? 't' : 'f')
4142
#define MAKE_TOKEN(token_type) _PyLexer_token_setup(tok, token, token_type, p_start, p_end)
4243
#define MAKE_TYPE_COMMENT_TOKEN(token_type, col_offset, end_col_offset) (\
4344
_PyLexer_type_comment_token_setup(tok, token, token_type, col_offset, end_col_offset, p_start, p_end))
@@ -113,7 +114,7 @@ set_ftstring_expr(struct tok_state* tok, struct token *token, char c) {
113114
assert(c == '}' || c == ':' || c == '!');
114115
tokenizer_mode *tok_mode = TOK_GET_MODE(tok);
115116

116-
if (!(tok_mode->f_string_debug || tok_mode->tstring) || token->metadata) {
117+
if (!(tok_mode->f_string_debug || tok_mode->string_kind == TSTRING) || token->metadata) {
117118
return 0;
118119
}
119120
PyObject *res = NULL;
@@ -993,12 +994,12 @@ tok_get_normal_mode(struct tok_state *tok, tokenizer_mode* current_tok, struct t
993994
the_current_tok->in_format_spec = 0;
994995
the_current_tok->f_string_debug = 0;
995996

996-
int tstring = 0;
997+
enum string_kind_t string_kind = FSTRING;
997998
switch (*tok->start) {
998999
case 'T':
9991000
case 't':
10001001
the_current_tok->f_string_raw = Py_TOLOWER(*(tok->start + 1)) == 'r';
1001-
tstring = 1;
1002+
string_kind = TSTRING;
10021003
break;
10031004
case 'F':
10041005
case 'f':
@@ -1007,16 +1008,18 @@ tok_get_normal_mode(struct tok_state *tok, tokenizer_mode* current_tok, struct t
10071008
case 'R':
10081009
case 'r':
10091010
the_current_tok->f_string_raw = 1;
1010-
tstring = Py_TOLOWER(*(tok->start + 1)) == 't';
1011+
if (Py_TOLOWER(*(tok->start + 1)) == 't') {
1012+
string_kind = TSTRING;
1013+
}
10111014
break;
10121015
default:
10131016
Py_UNREACHABLE();
10141017
}
10151018

1016-
the_current_tok->tstring = tstring;
1019+
the_current_tok->string_kind = string_kind;
10171020
the_current_tok->curly_bracket_depth = 0;
10181021
the_current_tok->curly_bracket_expr_start_depth = -1;
1019-
return tstring ? MAKE_TOKEN(TSTRING_START) : MAKE_TOKEN(FSTRING_START);
1022+
return string_kind == TSTRING ? MAKE_TOKEN(TSTRING_START) : MAKE_TOKEN(FSTRING_START);
10201023
}
10211024

10221025
letter_quote:
@@ -1079,7 +1082,7 @@ tok_get_normal_mode(struct tok_state *tok, tokenizer_mode* current_tok, struct t
10791082
if (the_current_tok->f_string_quote == quote &&
10801083
the_current_tok->f_string_quote_size == quote_size) {
10811084
return MAKE_TOKEN(_PyTokenizer_syntaxerror(tok,
1082-
"%c-string: expecting '}'", TOK_GET_MODE(tok)->tstring ? 't' : 'f'));
1085+
"%c-string: expecting '}'", TOK_GET_STRING_PREFIX(tok)));
10831086
}
10841087
}
10851088

@@ -1209,7 +1212,7 @@ tok_get_normal_mode(struct tok_state *tok, tokenizer_mode* current_tok, struct t
12091212
case '}':
12101213
if (INSIDE_FSTRING(tok) && !current_tok->curly_bracket_depth && c == '}') {
12111214
return MAKE_TOKEN(_PyTokenizer_syntaxerror(tok,
1212-
"%c-string: single '}' is not allowed", TOK_GET_MODE(tok)->tstring ? 't' : 'f'));
1215+
"%c-string: single '}' is not allowed", TOK_GET_STRING_PREFIX(tok)));
12131216
}
12141217
if (!tok->tok_extra_tokens && !tok->level) {
12151218
return MAKE_TOKEN(_PyTokenizer_syntaxerror(tok, "unmatched '%c'", c));
@@ -1230,7 +1233,7 @@ tok_get_normal_mode(struct tok_state *tok, tokenizer_mode* current_tok, struct t
12301233
int previous_bracket = current_tok->curly_bracket_depth - 1;
12311234
if (previous_bracket == current_tok->curly_bracket_expr_start_depth) {
12321235
return MAKE_TOKEN(_PyTokenizer_syntaxerror(tok,
1233-
"%c-string: unmatched '%c'", TOK_GET_MODE(tok)->tstring ? 't' : 'f', c));
1236+
"%c-string: unmatched '%c'", TOK_GET_STRING_PREFIX(tok), c));
12341237
}
12351238
}
12361239
if (tok->parenlinenostack[tok->level] != tok->lineno) {
@@ -1252,7 +1255,7 @@ tok_get_normal_mode(struct tok_state *tok, tokenizer_mode* current_tok, struct t
12521255
current_tok->curly_bracket_depth--;
12531256
if (current_tok->curly_bracket_depth < 0) {
12541257
return MAKE_TOKEN(_PyTokenizer_syntaxerror(tok, "%c-string: unmatched '%c'",
1255-
TOK_GET_MODE(tok)->tstring ? 't' : 'f', c));
1258+
TOK_GET_STRING_PREFIX(tok), c));
12561259
}
12571260
if (c == '}' && current_tok->curly_bracket_depth == current_tok->curly_bracket_expr_start_depth) {
12581261
current_tok->curly_bracket_expr_start_depth--;
@@ -1303,7 +1306,7 @@ tok_get_fstring_mode(struct tok_state *tok, tokenizer_mode* current_tok, struct
13031306
current_tok->curly_bracket_expr_start_depth++;
13041307
if (current_tok->curly_bracket_expr_start_depth >= MAX_EXPR_NESTING) {
13051308
return MAKE_TOKEN(_PyTokenizer_syntaxerror(tok,
1306-
"%c-string: expressions nested too deeply", TOK_GET_MODE(tok)->tstring ? 't' : 'f'));
1309+
"%c-string: expressions nested too deeply", TOK_GET_STRING_PREFIX(tok)));
13071310
}
13081311
TOK_GET_MODE(tok)->kind = TOK_REGULAR_MODE;
13091312
return tok_get_normal_mode(tok, current_tok, token);
@@ -1383,7 +1386,7 @@ tok_get_fstring_mode(struct tok_state *tok, tokenizer_mode* current_tok, struct
13831386
_PyTokenizer_syntaxerror(tok,
13841387
"unterminated triple-quoted %c-string literal"
13851388
" (detected at line %d)",
1386-
TOK_GET_MODE(tok)->tstring ? 't' : 'f', start);
1389+
TOK_GET_STRING_PREFIX(tok), start);
13871390
if (c != '\n') {
13881391
tok->done = E_EOFS;
13891392
}
@@ -1392,7 +1395,7 @@ tok_get_fstring_mode(struct tok_state *tok, tokenizer_mode* current_tok, struct
13921395
else {
13931396
return MAKE_TOKEN(_PyTokenizer_syntaxerror(tok,
13941397
"unterminated %c-string literal (detected at"
1395-
" line %d)", TOK_GET_MODE(tok)->tstring ? 't' : 'f', start));
1398+
" line %d)", TOK_GET_STRING_PREFIX(tok), start));
13961399
}
13971400
}
13981401

@@ -1414,7 +1417,7 @@ tok_get_fstring_mode(struct tok_state *tok, tokenizer_mode* current_tok, struct
14141417
current_tok->curly_bracket_expr_start_depth++;
14151418
if (current_tok->curly_bracket_expr_start_depth >= MAX_EXPR_NESTING) {
14161419
return MAKE_TOKEN(_PyTokenizer_syntaxerror(tok,
1417-
"%c-string: expressions nested too deeply", TOK_GET_MODE(tok)->tstring ? 't' : 'f'));
1420+
"%c-string: expressions nested too deeply", TOK_GET_STRING_PREFIX(tok)));
14181421
}
14191422
TOK_GET_MODE(tok)->kind = TOK_REGULAR_MODE;
14201423
current_tok->in_format_spec = 0;

Parser/lexer/state.h

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -36,6 +36,11 @@ enum tokenizer_mode_kind_t {
3636
TOK_FSTRING_MODE,
3737
};
3838

39+
enum string_kind_t {
40+
FSTRING,
41+
TSTRING,
42+
};
43+
3944
#define MAX_EXPR_NESTING 3
4045

4146
typedef struct _tokenizer_mode {
@@ -60,7 +65,7 @@ typedef struct _tokenizer_mode {
6065
int f_string_debug;
6166
int in_format_spec;
6267

63-
int tstring;
68+
enum string_kind_t string_kind;
6469
} tokenizer_mode;
6570

6671
/* Tokenizer state */

Parser/parser.c

Lines changed: 1 addition & 1 deletion
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

Parser/pegen.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -27,6 +27,7 @@
2727
#define CURRENT_POS (-5)
2828

2929
#define TOK_GET_MODE(tok) (&(tok->tok_mode_stack[tok->tok_mode_stack_index]))
30+
#define TOK_GET_STRING_PREFIX(tok) (TOK_GET_MODE(tok)->string_kind == TSTRING ? 't' : 'f')
3031

3132
typedef struct _memo {
3233
int type;

0 commit comments

Comments
 (0)