Skip to content

Commit 4d1a440

Browse files
committed
first implementation of d-string
1 parent 61fc72a commit 4d1a440

File tree

5 files changed

+373
-33
lines changed

5 files changed

+373
-33
lines changed

Lib/test/test_dstring.py

Lines changed: 47 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,47 @@
1+
import unittest
2+
3+
4+
class DStringTestCase(unittest.TestCase):
5+
def assertAllRaise(self, exception_type, regex, error_strings):
6+
for str in error_strings:
7+
with self.subTest(str=str):
8+
with self.assertRaisesRegex(exception_type, regex) as cm:
9+
eval(str)
10+
# print("Testing expression:", repr(str))
11+
# print(repr(cm.exception))
12+
# print(repr(cm.exception.text))
13+
14+
def test_single_quote(self):
15+
exprs = [
16+
"d'hello'",
17+
'D"hello"',
18+
"d'hello\\nworld'",
19+
]
20+
self.assertAllRaise(SyntaxError, "d-string must be triple-quoted", exprs)
21+
22+
def test_empty_dstring(self):
23+
exprs = [
24+
"d''''''",
25+
'D""""""',
26+
]
27+
self.assertAllRaise(SyntaxError, "d-string must start with a newline", exprs)
28+
29+
def test_no_last_newline(self):
30+
exprs = [
31+
"d'''\nhello world'''",
32+
'D"""\nhello world"""',
33+
"df'''\nhello {42}'''",
34+
]
35+
self.assertAllRaise(SyntaxError, "d-string must end with an indent line", exprs)
36+
37+
def test_simple_dstring(self):
38+
self.assertEqual(eval('d"""\n hello world\n """'), "hello world\n")
39+
self.assertEqual(eval('d"""\n hello world\n """'), " hello world\n")
40+
self.assertEqual(eval('d"""\n hello world\n"""'), " hello world\n")
41+
self.assertEqual(eval('d"""\n hello world\\\n """'), " hello world")
42+
self.assertEqual(eval('dr"""\n hello world\\\n """'), " hello world\\\n")
43+
44+
45+
46+
if __name__ == '__main__':
47+
unittest.main()

Lib/tokenize.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -86,7 +86,8 @@ def _all_string_prefixes():
8686
# The valid string prefixes. Only contain the lower case versions,
8787
# and don't contain any permutations (include 'fr', but not
8888
# 'rf'). The various permutations will be generated.
89-
_valid_string_prefixes = ['b', 'r', 'u', 'f', 't', 'br', 'fr', 'tr']
89+
_valid_string_prefixes = ['b', 'r', 'u', 'f', 't', 'd', 'br', 'fr', 'tr',
90+
'df', 'dt', 'dr', 'dfr', 'dtr']
9091
# if we add binary f-strings, add: ['fb', 'fbr']
9192
result = {''}
9293
for prefix in _valid_string_prefixes:

Parser/action_helpers.c

Lines changed: 171 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -1292,24 +1292,124 @@ _PyPegen_nonparen_genexp_in_call(Parser *p, expr_ty args, asdl_comprehension_seq
12921292

12931293
// Fstring stuff
12941294

1295+
static int
1296+
unicodewriter_write_line(Parser *p, PyUnicodeWriter *w, const char *line_start, const char *line_end,
1297+
int is_raw, Token* token)
1298+
{
1299+
if (is_raw || memchr(line_start, '\\', line_end - line_start) == NULL) {
1300+
return PyUnicodeWriter_WriteUTF8(w, line_start, line_end - line_start);
1301+
}
1302+
else {
1303+
PyObject *line = _PyPegen_decode_string(p, 1, line_start, line_end - line_start, token);
1304+
if (line == NULL || PyUnicodeWriter_WriteStr(w, line) < 0) {
1305+
Py_XDECREF(line);
1306+
return -1;
1307+
}
1308+
Py_DECREF(line);
1309+
}
1310+
return 0;
1311+
}
1312+
1313+
// Strip the common leading indentation from one constant part of a d-string
// and return the result as a new str object.
//
//   s / len      : UTF-8 bytes of this part (NUL-terminated; len == strlen(s)).
//   indent_char  : the indentation character (' ' or '\t') taken from the
//                  closing-quote line; dedented lines must start with it.
//   dedent_count : number of indent_char's to remove from each line.
//   is_raw       : nonzero for r-prefixed strings; skips escape decoding.
//   is_first     : nonzero for the first part of the d-string, which is
//                  required to begin with the mandatory leading newline.
//   constant     : the Constant node, used for error line numbers.
//   token        : token used for escape-decoding error locations.
//
// Returns a new reference, or NULL with an exception set (SyntaxError when a
// non-empty line lacks the required indentation).
static PyObject*
_PyPegen_dedent_string_part(Parser *p, const char *s, size_t len, int indent_char, Py_ssize_t dedent_count,
                            int is_raw, int is_first, expr_ty constant, Token* token)
{
    Py_ssize_t lineno = constant->lineno;
    const char *line_start = s;
    const char *s_end = s + len;

    PyUnicodeWriter *w = PyUnicodeWriter_Create(len);
    if (w == NULL) {
        return NULL;
    }
    if (is_first) {
        assert (line_start[0] == '\n');
        line_start++; // skip the first newline
    }
    else {
        // Example: df"""
        //     first part {param} second part
        //     next line
        //     """
        // We don't need to dedent the first line in the non-first parts:
        // it is the continuation of a line whose indent was already removed.
        const char *line_end = memchr(line_start, '\n', s_end - line_start);
        if (line_end) {
            line_end++; // include the newline
        }
        else {
            line_end = s_end;
        }
        if (unicodewriter_write_line(p, w, line_start, line_end, is_raw, token) < 0) {
            PyUnicodeWriter_Discard(w);
            return NULL;
        }
        line_start = line_end;
    }

    // Walk the remaining lines, stripping up to dedent_count indent chars
    // from the front of each.
    while (line_start < s + len) {
        lineno++;

        // Count how much of the expected indentation this line actually has.
        Py_ssize_t i = 0;
        while (line_start + i < s_end && i < dedent_count && line_start[i] == indent_char) {
            i++;
        }

        if (line_start[i] == '\0') { // found an empty line without newline.
            break;
        }
        if (line_start[i] == '\n') { // found an empty line with newline.
            // Blank lines are kept but never require indentation.
            if (PyUnicodeWriter_WriteChar(w, '\n') < 0) {
                PyUnicodeWriter_Discard(w);
                return NULL;
            }
            line_start += i+1;
            continue;
        }
        if (i < dedent_count) { // found an invalid indent.
            // Non-blank line with content starting before the dedent column.
            assert(line_start[i] != indent_char);
            PyUnicodeWriter_Discard(w);
            RAISE_ERROR_KNOWN_LOCATION(p, PyExc_SyntaxError, lineno, i, lineno, i+1,
                "d-string line missing valid indentation");
            return NULL;
        }

        // found an indented line. let's dedent it.
        line_start += i;
        const char *line_end = memchr(line_start, '\n', s_end - line_start);
        if (line_end) {
            line_end++; // include the newline
        }
        else {
            line_end = s_end;
        }
        if (unicodewriter_write_line(p, w, line_start, line_end, is_raw, token) < 0) {
            PyUnicodeWriter_Discard(w);
            return NULL;
        }
        line_start = line_end;
    }
    return PyUnicodeWriter_Finish(w);
}
1393+
12951394
static expr_ty
1296-
_PyPegen_decode_fstring_part(Parser* p, int is_raw, expr_ty constant, Token* token) {
1395+
_PyPegen_decode_fstring_part(Parser* p, int is_first, int is_raw, int indent_char, Py_ssize_t dedent_count, expr_ty constant, Token* token) {
12971396
assert(PyUnicode_CheckExact(constant->v.Constant.value));
12981397

12991398
const char* bstr = PyUnicode_AsUTF8(constant->v.Constant.value);
13001399
if (bstr == NULL) {
13011400
return NULL;
13021401
}
1402+
is_raw = is_raw || strchr(bstr, '\\') == NULL;
13031403

1304-
size_t len;
1305-
if (strcmp(bstr, "{{") == 0 || strcmp(bstr, "}}") == 0) {
1306-
len = 1;
1307-
} else {
1308-
len = strlen(bstr);
1404+
PyObject *str = NULL;
1405+
if (dedent_count > 0) {
1406+
str = _PyPegen_dedent_string_part(p, bstr, strlen(bstr), indent_char, dedent_count,
1407+
is_raw, is_first, constant, token);
1408+
}
1409+
else {
1410+
str = _PyPegen_decode_string(p, is_raw, bstr, strlen(bstr), token);
13091411
}
13101412

1311-
is_raw = is_raw || strchr(bstr, '\\') == NULL;
1312-
PyObject *str = _PyPegen_decode_string(p, is_raw, bstr, len, token);
13131413
if (str == NULL) {
13141414
_Pypegen_raise_decode_error(p);
13151415
return NULL;
@@ -1340,12 +1440,74 @@ _get_resized_exprs(Parser *p, Token *a, asdl_expr_seq *raw_expressions, Token *b
13401440
return NULL;
13411441
}
13421442
int is_raw = strpbrk(quote_str, "rR") != NULL;
1443+
int is_dedent = strpbrk(quote_str, "dD") != NULL;
1444+
int indent_char = 0;
1445+
Py_ssize_t indent_count = 0;
13431446

13441447
asdl_expr_seq *seq = _Py_asdl_expr_seq_new(total_items, p->arena);
13451448
if (seq == NULL) {
13461449
return NULL;
13471450
}
13481451

1452+
if (is_dedent) {
1453+
expr_ty first_item = asdl_seq_GET(raw_expressions, 0);
1454+
if (first_item->kind != Constant_kind
1455+
|| PyUnicode_ReadChar(first_item->v.Constant.value, 0) != '\n') {
1456+
RAISE_SYNTAX_ERROR_KNOWN_LOCATION(
1457+
first_item,
1458+
"d-string must start with a newline"
1459+
);
1460+
return NULL;
1461+
}
1462+
1463+
expr_ty last_item = asdl_seq_GET(raw_expressions, n_items - 1);
1464+
if (last_item->kind != Constant_kind) {
1465+
RAISE_SYNTAX_ERROR_KNOWN_LOCATION(
1466+
last_item,
1467+
"d-string must end with an indent line"
1468+
);
1469+
return NULL;
1470+
}
1471+
1472+
Py_ssize_t blen;
1473+
const char *bstr = PyUnicode_AsUTF8AndSize(last_item->v.Constant.value, &blen);
1474+
if (bstr == NULL) {
1475+
return NULL;
1476+
}
1477+
1478+
// memrchr is GNU extension; use manual loop for portability.
1479+
const char *lastline = bstr + blen;
1480+
while (bstr < lastline) {
1481+
if (lastline[-1] == '\n') {
1482+
break;
1483+
}
1484+
lastline--;
1485+
if (*lastline != ' ' && *lastline != '\t') {
1486+
RAISE_SYNTAX_ERROR_KNOWN_LOCATION(
1487+
last_item,
1488+
"d-string must end with an indent line"
1489+
);
1490+
return NULL;
1491+
}
1492+
}
1493+
1494+
// checks indent of the last line.
1495+
indent_count = bstr + blen - lastline;
1496+
if (indent_count > 0) {
1497+
indent_char = lastline[0];
1498+
1499+
for (Py_ssize_t i = 1; i < indent_count; i++) {
1500+
if (lastline[i] != indent_char) {
1501+
RAISE_ERROR_KNOWN_LOCATION(
1502+
p, PyExc_TabError, last_item->end_lineno, i, last_item->end_lineno, i+1,
1503+
"inconsistent use of tabs and spaces in indentation"
1504+
);
1505+
return NULL;
1506+
}
1507+
}
1508+
}
1509+
}
1510+
13491511
Py_ssize_t index = 0;
13501512
for (Py_ssize_t i = 0; i < n_items; i++) {
13511513
expr_ty item = asdl_seq_GET(raw_expressions, i);
@@ -1377,7 +1539,7 @@ _get_resized_exprs(Parser *p, Token *a, asdl_expr_seq *raw_expressions, Token *b
13771539
}
13781540

13791541
if (item->kind == Constant_kind) {
1380-
item = _PyPegen_decode_fstring_part(p, is_raw, item, b);
1542+
item = _PyPegen_decode_fstring_part(p, i == 0, is_raw, indent_char, indent_count, item, b);
13811543
if (item == NULL) {
13821544
return NULL;
13831545
}

Parser/lexer/lexer.c

Lines changed: 36 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -455,7 +455,7 @@ tok_continuation_line(struct tok_state *tok) {
455455
static int
456456
maybe_raise_syntax_error_for_string_prefixes(struct tok_state *tok,
457457
int saw_b, int saw_r, int saw_u,
458-
int saw_f, int saw_t) {
458+
int saw_f, int saw_t, int saw_d) {
459459
// Supported: rb, rf, rt (in any order)
460460
// Unsupported: ub, ur, uf, ut, bf, bt, ft (in any order)
461461

@@ -480,13 +480,19 @@ maybe_raise_syntax_error_for_string_prefixes(struct tok_state *tok,
480480
if (saw_u && saw_t) {
481481
RETURN_SYNTAX_ERROR("u", "t");
482482
}
483+
if (saw_u && saw_d) {
484+
RETURN_SYNTAX_ERROR("u", "d");
485+
}
483486

484487
if (saw_b && saw_f) {
485488
RETURN_SYNTAX_ERROR("b", "f");
486489
}
487490
if (saw_b && saw_t) {
488491
RETURN_SYNTAX_ERROR("b", "t");
489492
}
493+
if (saw_b && saw_d) {
494+
RETURN_SYNTAX_ERROR("b", "d");
495+
}
490496

491497
if (saw_f && saw_t) {
492498
RETURN_SYNTAX_ERROR("f", "t");
@@ -741,8 +747,8 @@ tok_get_normal_mode(struct tok_state *tok, tokenizer_mode* current_tok, struct t
741747
/* Identifier (most frequent token!) */
742748
nonascii = 0;
743749
if (is_potential_identifier_start(c)) {
744-
/* Process the various legal combinations of b"", r"", u"", and f"". */
745-
int saw_b = 0, saw_r = 0, saw_u = 0, saw_f = 0, saw_t = 0;
750+
/* Process the various legal combinations of b"", r"", u"", f"", and d"". */
751+
int saw_b = 0, saw_r = 0, saw_u = 0, saw_f = 0, saw_t = 0, saw_d = 0;
746752
while (1) {
747753
if (!saw_b && (c == 'b' || c == 'B')) {
748754
saw_b = 1;
@@ -762,14 +768,17 @@ tok_get_normal_mode(struct tok_state *tok, tokenizer_mode* current_tok, struct t
762768
else if (!saw_t && (c == 't' || c == 'T')) {
763769
saw_t = 1;
764770
}
771+
else if (!saw_d && (c == 'd' || c == 'D')) {
772+
saw_d = 1;
773+
}
765774
else {
766775
break;
767776
}
768777
c = tok_nextc(tok);
769778
if (c == '"' || c == '\'') {
770779
// Raise error on incompatible string prefixes:
771780
int status = maybe_raise_syntax_error_for_string_prefixes(
772-
tok, saw_b, saw_r, saw_u, saw_f, saw_t);
781+
tok, saw_b, saw_r, saw_u, saw_f, saw_t, saw_d);
773782
if (status < 0) {
774783
return MAKE_TOKEN(ERRORTOKEN);
775784
}
@@ -1049,7 +1058,7 @@ tok_get_normal_mode(struct tok_state *tok, tokenizer_mode* current_tok, struct t
10491058
}
10501059

10511060
f_string_quote:
1052-
if (((Py_TOLOWER(*tok->start) == 'f' || Py_TOLOWER(*tok->start) == 'r' || Py_TOLOWER(*tok->start) == 't')
1061+
if (((Py_TOLOWER(*tok->start) == 'f' || Py_TOLOWER(*tok->start) == 'r' || Py_TOLOWER(*tok->start) == 't' || Py_TOLOWER(*tok->start) == 'd')
10531062
&& (c == '\'' || c == '"'))) {
10541063

10551064
int quote = c;
@@ -1089,6 +1098,7 @@ tok_get_normal_mode(struct tok_state *tok, tokenizer_mode* current_tok, struct t
10891098
the_current_tok->kind = TOK_FSTRING_MODE;
10901099
the_current_tok->quote = quote;
10911100
the_current_tok->quote_size = quote_size;
1101+
the_current_tok->raw = 0;
10921102
the_current_tok->start = tok->start;
10931103
the_current_tok->multi_line_start = tok->line_start;
10941104
the_current_tok->first_line = tok->lineno;
@@ -1101,25 +1111,28 @@ tok_get_normal_mode(struct tok_state *tok, tokenizer_mode* current_tok, struct t
11011111
the_current_tok->in_debug = 0;
11021112

11031113
enum string_kind_t string_kind = FSTRING;
1104-
switch (*tok->start) {
1105-
case 'T':
1106-
case 't':
1107-
the_current_tok->raw = Py_TOLOWER(*(tok->start + 1)) == 'r';
1108-
string_kind = TSTRING;
1109-
break;
1110-
case 'F':
1111-
case 'f':
1112-
the_current_tok->raw = Py_TOLOWER(*(tok->start + 1)) == 'r';
1113-
break;
1114-
case 'R':
1115-
case 'r':
1116-
the_current_tok->raw = 1;
1117-
if (Py_TOLOWER(*(tok->start + 1)) == 't') {
1114+
for (const char *p = tok->start; *p != c; p++) {
1115+
switch (*p) {
1116+
case 'f':
1117+
case 'F':
1118+
break;
1119+
case 't':
1120+
case 'T':
11181121
string_kind = TSTRING;
1119-
}
1120-
break;
1121-
default:
1122-
Py_UNREACHABLE();
1122+
break;
1123+
case 'r':
1124+
case 'R':
1125+
the_current_tok->raw = 1;
1126+
break;
1127+
case 'd':
1128+
case 'D':
1129+
if (quote_size != 3) {
1130+
return MAKE_TOKEN(_PyTokenizer_syntaxerror(tok, "d-string must be a multiline string"));
1131+
}
1132+
break;
1133+
default:
1134+
Py_UNREACHABLE();
1135+
}
11231136
}
11241137

11251138
the_current_tok->string_kind = string_kind;

0 commit comments

Comments
 (0)