use least indent instead of closing quote indent

methane · methane · commit 21cc35aae53c · 2026-01-21T01:29:30.000+09:00
diff --git a/Lib/test/test_dstring.py b/Lib/test/test_dstring.py
@@ -26,14 +26,6 @@ def test_empty_dstring(self):
         ]
         self.assertAllRaise(SyntaxError, "d-string must start with a newline", exprs)
 
-    def test_no_last_newline(self):
-        exprs = [
-            "d'''\nhello world'''",
-            'D"""\nhello world"""',
-            "df'''\nhello {42}'''",
-        ]
-        self.assertAllRaise(SyntaxError, "d-string must end with an indent line", exprs)
-
     def test_simple_dstring(self):
         self.assertEqual(eval('d"""\n  hello world\n  """'), "hello world\n")
         self.assertEqual(eval('d"""\n  hello world\n """'), " hello world\n")
@@ -42,6 +34,5 @@ def test_simple_dstring(self):
         self.assertEqual(eval('dr"""\n  hello world\\\n """'), " hello world\\\n")
 
 
-
 if __name__ == '__main__':
     unittest.main()
diff --git a/Objects/unicodeobject.c b/Objects/unicodeobject.c
@@ -13480,8 +13480,8 @@ of all lines in the [src, end).
 It returns the length of the common leading whitespace and sets `output` to
 point to the beginning of the common leading whitespace if length > 0.
 */
-static Py_ssize_t
-search_longest_common_leading_whitespace(
+Py_ssize_t
+_Py_search_longest_common_leading_whitespace(
     const char *const src,
     const char *const end,
     const char **output)
@@ -13576,7 +13576,7 @@ _PyUnicode_Dedent(PyObject *unicode)
     // [whitespace_start, whitespace_start + whitespace_len)
     // describes the current longest common leading whitespace
     const char *whitespace_start = NULL;
-    Py_ssize_t whitespace_len = search_longest_common_leading_whitespace(
+    Py_ssize_t whitespace_len = _Py_search_longest_common_leading_whitespace(
         src, end, &whitespace_start);
 
     if (whitespace_len == 0) {
diff --git a/Parser/action_helpers.c b/Parser/action_helpers.c
@@ -1311,8 +1311,8 @@ unicodewriter_write_line(Parser *p, PyUnicodeWriter *w, const char *line_start,
 }
 
 static PyObject*
-_PyPegen_dedent_string_part(Parser *p, const char *s, size_t len, int indent_char, Py_ssize_t dedent_count,
-                            int is_raw, int is_first, expr_ty constant, Token* token)
+_PyPegen_dedent_string_part(Parser *p, const char *s, size_t len, const char *indent, Py_ssize_t indent_len,
+                            int is_first, int is_raw, expr_ty constant, Token* token)
 {
     Py_ssize_t lineno = constant->lineno;
     const char *line_start = s;
@@ -1350,7 +1350,7 @@ _PyPegen_dedent_string_part(Parser *p, const char *s, size_t len, int indent_cha
         lineno++;
 
         Py_ssize_t i = 0;
-        while (line_start + i < s_end && i < dedent_count && line_start[i] == indent_char) {
+        while (line_start + i < s_end && i < indent_len && line_start[i] == indent[i]) {
             i++;
         }
 
@@ -1365,8 +1365,8 @@ _PyPegen_dedent_string_part(Parser *p, const char *s, size_t len, int indent_cha
             line_start += i+1;
             continue;
         }
-        if (i < dedent_count) {  // found an invalid indent.
-            assert(line_start[i] != indent_char);
+        if (i < indent_len) {  // found an invalid indent.
+            assert(line_start[i] != indent[i]);
             PyUnicodeWriter_Discard(w);
             RAISE_ERROR_KNOWN_LOCATION(p, PyExc_SyntaxError, lineno, i, lineno, i+1,
                 "d-string line missing valid indentation");
@@ -1392,7 +1392,10 @@ _PyPegen_dedent_string_part(Parser *p, const char *s, size_t len, int indent_cha
 }
 
 static expr_ty
-_PyPegen_decode_fstring_part(Parser* p, int is_first, int is_raw, int indent_char, Py_ssize_t dedent_count, expr_ty constant, Token* token) {
+_PyPegen_decode_fstring_part(Parser* p, int is_first, int is_raw,
+                             const char *indent, Py_ssize_t indent_len,
+                             expr_ty constant, Token* token)
+{
     assert(PyUnicode_CheckExact(constant->v.Constant.value));
 
     const char* bstr = PyUnicode_AsUTF8(constant->v.Constant.value);
@@ -1402,9 +1405,9 @@ _PyPegen_decode_fstring_part(Parser* p, int is_first, int is_raw, int indent_cha
     is_raw = is_raw || strchr(bstr, '\\') == NULL;
 
     PyObject *str = NULL;
-    if (dedent_count > 0) {
-        str = _PyPegen_dedent_string_part(p, bstr, strlen(bstr), indent_char, dedent_count,
-                                        is_raw, is_first, constant, token);
+    if (indent_len > 0) {
+        str = _PyPegen_dedent_string_part(p, bstr, strlen(bstr), indent, indent_len,
+                                          is_first, is_raw, constant, token);
     }
     else {
         str = _PyPegen_decode_string(p, is_raw, bstr, strlen(bstr), token);
@@ -1423,6 +1426,14 @@ _PyPegen_decode_fstring_part(Parser* p, int is_first, int is_raw, int indent_cha
                            p->arena);
 }
 
+/* defined in unicodeobject.c */
+extern Py_ssize_t
+_Py_search_longest_common_leading_whitespace(
+    const char *const src,
+    const char *const end,
+    const char **output
+    );
+
 static asdl_expr_seq *
 _get_resized_exprs(Parser *p, Token *a, asdl_expr_seq *raw_expressions, Token *b, enum string_kind_t string_kind)
 {
@@ -1441,14 +1452,15 @@ _get_resized_exprs(Parser *p, Token *a, asdl_expr_seq *raw_expressions, Token *b
     }
     int is_raw = strpbrk(quote_str, "rR") != NULL;
     int is_dedent = strpbrk(quote_str, "dD") != NULL;
-    int indent_char = 0;
-    Py_ssize_t indent_count = 0;
 
     asdl_expr_seq *seq = _Py_asdl_expr_seq_new(total_items, p->arena);
     if (seq == NULL) {
         return NULL;
     }
 
+    const char *common_indent_start = NULL;
+    Py_ssize_t common_indent_len = 0;
+
     if (is_dedent) {
         expr_ty first_item = asdl_seq_GET(raw_expressions, 0);
         if (first_item->kind != Constant_kind
@@ -1460,52 +1472,52 @@ _get_resized_exprs(Parser *p, Token *a, asdl_expr_seq *raw_expressions, Token *b
             return NULL;
         }
 
-        expr_ty last_item = asdl_seq_GET(raw_expressions, n_items - 1);
-        if (last_item->kind != Constant_kind) {
-            RAISE_SYNTAX_ERROR_KNOWN_LOCATION(
-                last_item,
-                "d-string must end with an indent line"
-            );
+        // Instead of calculating the common indent from each part separately,
+        // build a temporary string and calculate the common indent from it.
+        PyBytesWriter *w = PyBytesWriter_Create(0);
+        if (w == NULL) {
             return NULL;
         }
 
-        Py_ssize_t blen;
-        const char *bstr = PyUnicode_AsUTF8AndSize(last_item->v.Constant.value, &blen);
-        if (bstr == NULL) {
-            return NULL;
-        }
+        for (Py_ssize_t i = 0; i < n_items; i++) {
+            expr_ty item = asdl_seq_GET(raw_expressions, i);
 
-        // memrchr is GNU extension; use manual loop for portability.
-        const char *lastline = bstr + blen;
-        while (bstr < lastline) {
-            if (lastline[-1] == '\n') {
-                break;
-            }
-            lastline--;
-            if (*lastline != ' ' && *lastline != '\t') {
-                RAISE_SYNTAX_ERROR_KNOWN_LOCATION(
-                    last_item,
-                    "d-string must end with an indent line"
-                );
-                return NULL;
+            if (item->kind == JoinedStr_kind) {
+                // Write a placeholder.
+                if (PyBytesWriter_WriteBytes(w, "X", 1) < 0) {
+                    PyBytesWriter_Discard(w);
+                    return NULL;
+                }
+                continue;
             }
-        }
-
-        // checks indent of the last line.
-        indent_count = bstr + blen - lastline;
-        if (indent_count > 0) {
-            indent_char = lastline[0];
-
-            for (Py_ssize_t i = 1; i < indent_count; i++) {
-                if (lastline[i] != indent_char) {
-                    RAISE_ERROR_KNOWN_LOCATION(
-                        p, PyExc_TabError, last_item->end_lineno, i, last_item->end_lineno, i+1,
-                        "inconsistent use of tabs and spaces in indentation"
-                    );
+            if (item->kind == Constant_kind) {
+                Py_ssize_t blen;
+                const char *bstr = PyUnicode_AsUTF8AndSize(item->v.Constant.value, &blen);
+                if (bstr == NULL || PyBytesWriter_WriteBytes(w, bstr, blen) < 0) {
+                    PyBytesWriter_Discard(w);
                     return NULL;
                 }
+                continue;
             }
         }
+        // Add a terminator to include the last line before the ending quote
+        if (PyBytesWriter_WriteBytes(w, "X", 1) < 0) {
+            PyBytesWriter_Discard(w);
+            return NULL;
+        }
+
+        // TODO: instead of creating temp_bytes, we could search for the
+        // common indent in each part directly, but this needs a
+        // reimplementation of _Py_search_longest_common_leading_whitespace.
+        PyObject *temp_bytes = PyBytesWriter_Finish(w);
+        if (temp_bytes == NULL) {
+            return NULL;
+        }
+        _PyArena_AddPyObject(p->arena, temp_bytes);
+        const char *temp_str = PyBytes_AsString(temp_bytes);
+        const char *temp_end = temp_str + PyBytes_GET_SIZE(temp_bytes);
+        common_indent_len = _Py_search_longest_common_leading_whitespace(
+            temp_str, temp_end, &common_indent_start);
     }
 
     Py_ssize_t index = 0;
@@ -1539,7 +1551,7 @@ _get_resized_exprs(Parser *p, Token *a, asdl_expr_seq *raw_expressions, Token *b
         }
 
         if (item->kind == Constant_kind) {
-            item = _PyPegen_decode_fstring_part(p, i == 0, is_raw, indent_char, indent_count, item, b);
+            item = _PyPegen_decode_fstring_part(p, i == 0, is_raw, common_indent_start, common_indent_len, item, b);
             if (item == NULL) {
                 return NULL;
             }
diff --git a/Parser/lexer/lexer.c b/Parser/lexer/lexer.c
@@ -490,9 +490,6 @@ maybe_raise_syntax_error_for_string_prefixes(struct tok_state *tok,
     if (saw_b && saw_t) {
         RETURN_SYNTAX_ERROR("b", "t");
     }
-    if (saw_b && saw_d) {
-        RETURN_SYNTAX_ERROR("b", "d");
-    }
 
     if (saw_f && saw_t) {
         RETURN_SYNTAX_ERROR("f", "t");
diff --git a/Parser/string_parser.c b/Parser/string_parser.c

Original file line number	Diff line number	Diff line change
`@@ -490,9 +490,6 @@ maybe_raise_syntax_error_for_string_prefixes(struct tok_state *tok,`
`490`	`490`	`if (saw_b && saw_t) {`
`491`	`491`	`RETURN_SYNTAX_ERROR("b", "t");`
`492`	`492`	`}`
`493`		`- if (saw_b && saw_d) {`
`494`		`- RETURN_SYNTAX_ERROR("b", "d");`
`495`		`- }`
`496`	`493`
`497`	`494`	`if (saw_f && saw_t) {`
`498`	`495`	`RETURN_SYNTAX_ERROR("f", "t");`