Commit

StanFromIreland · StanFromIreland · commit 40bcdea78dae · 2025-09-07T13:37:03.000+01:00
diff --git a/Include/internal/pycore_unicodeobject.h b/Include/internal/pycore_unicodeobject.h
@@ -261,7 +261,7 @@ extern Py_ssize_t _PyUnicode_InsertThousandsGrouping(
    Behaviour is expected to be an exact match of `textwrap.dedent`.
    Return a new reference on success, NULL with exception set on error.
    */
-extern PyObject* _PyUnicode_Dedent(PyObject *unicode);
+PyAPI_FUNC(PyObject*) _PyUnicode_Dedent(PyObject *unicode);
 
 /* --- Misc functions ----------------------------------------------------- */
 
diff --git a/Lib/test/test_capi/test_unicode.py b/Lib/test/test_capi/test_unicode.py
@@ -1074,6 +1074,95 @@ def test_transform_decimal_and_space(self):
         self.assertRaises(SystemError, transform_decimal, [])
         # CRASHES transform_decimal(NULL)
 
+    @support.cpython_only
+    @unittest.skipIf(_testinternalcapi is None,'need _testinternalcapi module')
+    def test_dedent(self):
+        from _testinternalcapi import _PyUnicode_Dedent as dedent
+        self.assertEqual('hello\nworld', dedent('  hello\n  world'))
+        self.assertEqual('hello\nmy\n  friend', dedent('  hello\n  my\n    friend'))
+
+        # Only spaces.
+        text = "    "
+        expect = ""
+        self.assertEqual(expect, dedent(text))
+
+        # Only tabs.
+        text = "\t\t\t\t"
+        expect = ""
+        self.assertEqual(expect, dedent(text))
+
+        # A mixture.
+        text = " \t  \t\t  \t "
+        expect = ""
+        self.assertEqual(expect, dedent(text))
+
+        # ASCII whitespace.
+        text = "\f\n\r\t\v "
+        expect = "\n"
+        self.assertEqual(expect, dedent(text))
+
+        # One newline.
+        text = "\n"
+        expect = "\n"
+        self.assertEqual(expect, dedent(text))
+
+        # Windows-style newlines.
+        text = "\r\n" * 5
+        expect = "\n" * 5
+        self.assertEqual(expect, dedent(text))
+
+        # Whitespace mixture.
+        text = "    \n\t\n  \n\t\t\n\n\n       "
+        expect = "\n\n\n\n\n\n"
+        self.assertEqual(expect, dedent(text))
+
+        # Lines consisting only of whitespace are always normalised
+        text = "a\n \n\t\n"
+        expect = "a\n\n\n"
+        self.assertEqual(expect, dedent(text))
+
+        # Whitespace characters on non-empty lines are retained
+        text = "a\r\n\r\n\r\n"
+        expect = "a\r\n\n\n"
+        self.assertEqual(expect, dedent(text))
+
+        # Uneven indentation with declining indent level.
+        text = "     Foo\n    Bar\n"  # 5 spaces, then 4
+        expect = " Foo\nBar\n"
+        self.assertEqual(expect, dedent(text))
+
+        # Declining indent level with blank line.
+        text = "     Foo\n\n    Bar\n"  # 5 spaces, blank, then 4
+        expect = " Foo\n\nBar\n"
+        self.assertEqual(expect, dedent(text))
+
+        # Declining indent level with whitespace only line.
+        text = "     Foo\n    \n    Bar\n"  # 5 spaces, then 4, then 4
+        expect = " Foo\n\nBar\n"
+        self.assertEqual(expect, dedent(text))
+
+        text = "  hello\tthere\n  how are\tyou?"
+        expect = "hello\tthere\nhow are\tyou?"
+        self.assertEqual(expect, dedent(text))
+
+        # dedent() only removes whitespace that can be uniformly removed!
+        text = "\thello there\n\thow are you?"
+        expect = "hello there\nhow are you?"
+        self.assertEqual(expect, dedent(text))
+
+        text = '''\
+        def foo():
+            while 1:
+                return foo
+        '''
+        expect = '''\
+def foo():
+    while 1:
+        return foo
+'''
+        self.assertEqual(expect, dedent(text))
+
+
     @support.cpython_only
     @unittest.skipIf(_testlimitedcapi is None, 'need _testlimitedcapi module')
     def test_concat(self):
diff --git a/Misc/NEWS.d/next/Core_and_Builtins/2025-09-07-13-36-15.gh-issue-103997.jIPHCc.rst b/Misc/NEWS.d/next/Core_and_Builtins/2025-09-07-13-36-15.gh-issue-103997.jIPHCc.rst
@@ -0,0 +1 @@
+:option:`-c` now dedents its command argument the same way as :func:`textwrap.dedent`.
diff --git a/Modules/_testinternalcapi.c b/Modules/_testinternalcapi.c
@@ -34,7 +34,7 @@
 #include "pycore_pyerrors.h"      // _PyErr_ChainExceptions1()
 #include "pycore_pylifecycle.h"   // _PyInterpreterConfig_InitFromDict()
 #include "pycore_pystate.h"       // _PyThreadState_GET()
-#include "pycore_unicodeobject.h" // _PyUnicode_TransformDecimalAndSpaceToASCII()
+#include "pycore_unicodeobject.h" // _PyUnicode_TransformDecimalAndSpaceToASCII() / _PyUnicode_Dedent()
 
 #include "clinic/_testinternalcapi.c.h"
 
@@ -1416,6 +1416,17 @@ unicode_transformdecimalandspacetoascii(PyObject *self, PyObject *arg)
     return _PyUnicode_TransformDecimalAndSpaceToASCII(arg);
 }
 
+/* Test _PyUnicode_Dedent() */
+static PyObject *
+unicode_dedent(PyObject *self, PyObject *arg)
+{
+    if (arg == Py_None) {
+        arg = NULL;
+    }
+    return _PyUnicode_Dedent(arg);
+}
+
+
 static PyObject *
 test_pyobject_is_freed(const char *test_name, PyObject *op)
 {
@@ -2422,6 +2433,7 @@ static PyMethodDef module_functions[] = {
     {"_PyTraceMalloc_GetTraceback", tracemalloc_get_traceback, METH_VARARGS},
     {"test_tstate_capi", test_tstate_capi, METH_NOARGS, NULL},
     {"_PyUnicode_TransformDecimalAndSpaceToASCII", unicode_transformdecimalandspacetoascii, METH_O},
+    {"_PyUnicode_Dedent", unicode_dedent, METH_O},
     {"check_pyobject_forbidden_bytes_is_freed",
                             check_pyobject_forbidden_bytes_is_freed, METH_NOARGS},
     {"check_pyobject_freed_is_freed", check_pyobject_freed_is_freed, METH_NOARGS},
diff --git a/Objects/unicodeobject.c b/Objects/unicodeobject.c
@@ -14309,83 +14309,65 @@ unicode_getnewargs(PyObject *v, PyObject *Py_UNUSED(ignored))
 }
 
 /*
-This function searchs the longest common leading whitespace
-of all lines in the [src, end).
-It returns the length of the common leading whitespace and sets `output` to
-point to the beginning of the common leading whitespace if length > 0.
+   Find the longest common leading run of spaces/tabs among a list of lines.
+   Empty and whitespace-only lines are ignored.
+   Returns the margin length in code points (>= 0).
 */
 static Py_ssize_t
-search_longest_common_leading_whitespace(
-    const char *const src,
-    const char *const end,
-    const char **output)
-{
-    // [_start, _start + _len)
-    // describes the current longest common leading whitespace
-    const char *_start = NULL;
-    Py_ssize_t _len = 0;
-
-    for (const char *iter = src; iter < end; ++iter) {
-        const char *line_start = iter;
-        const char *leading_whitespace_end = NULL;
-
-        // scan the whole line
-        while (iter < end && *iter != '\n') {
-            if (!leading_whitespace_end && *iter != ' ' && *iter != '\t') {
-                /* `iter` points to the first non-whitespace character
-                   in this line */
-                if (iter == line_start) {
-                    // some line has no indent, fast exit!
-                    return 0;
-                }
-                leading_whitespace_end = iter;
-            }
-            ++iter;
-        }
+search_longest_common_leading_whitespace(PyObject *lines, Py_ssize_t nlines)
+{
+    PyObject *smallest = NULL, *largest = NULL;
+    for (Py_ssize_t i = 0; i < nlines; i++) {
+        PyObject *line = PyList_GET_ITEM(lines, i);
+        Py_ssize_t linelen = PyUnicode_GET_LENGTH(line);
 
-        // if this line has all white space, skip it
-        if (!leading_whitespace_end) {
+        if (linelen == 0) {
             continue;
         }
 
-        if (!_start) {
-            // update the first leading whitespace
-            _start = line_start;
-            _len = leading_whitespace_end - line_start;
-            assert(_len > 0);
+        int kind = PyUnicode_KIND(line);
+        void *data = PyUnicode_DATA(line);
+        int all_ws = 1;
+        for (Py_ssize_t j = 0; j < linelen; j++) {
+            if (!Py_UNICODE_ISSPACE(PyUnicode_READ(kind, data, j))) {
+                all_ws = 0;
+                break;
+            }
+        }
+        if (all_ws) {
+            continue;
         }
-        else {
-            /* We then compare with the current longest leading whitespace.
 
-               [line_start, leading_whitespace_end) is the leading
-               whitespace of this line,
+        if (smallest == NULL || PyObject_RichCompareBool(line, smallest, Py_LT)) {
+            smallest = line;
+        }
+        if (largest == NULL || PyObject_RichCompareBool(line, largest, Py_GT)) {
+            largest = line;
+        }
+    }
 
-               [_start, _start + _len) is the leading whitespace of the
-               current longest leading whitespace. */
-            Py_ssize_t new_len = 0;
-            const char *_iter = _start, *line_iter = line_start;
+    if (smallest == NULL || largest == NULL) {
+        return 0;
+    }
 
-            while (_iter < _start + _len && line_iter < leading_whitespace_end
-                   && *_iter == *line_iter)
-            {
-                ++_iter;
-                ++line_iter;
-                ++new_len;
-            }
+    Py_ssize_t margin = 0;
+    Py_ssize_t minlen = Py_MIN(PyUnicode_GET_LENGTH(smallest),
+                               PyUnicode_GET_LENGTH(largest));
+    int skind = PyUnicode_KIND(smallest);
+    int lkind = PyUnicode_KIND(largest);
+    const void *sdata = PyUnicode_DATA(smallest);
+    const void *ldata = PyUnicode_DATA(largest);
 
-            _len = new_len;
-            if (_len == 0) {
-                // No common things now, fast exit!
-                return 0;
-            }
+    while (margin < minlen) {
+        Py_UCS4 c1 = PyUnicode_READ(skind, sdata, margin);
+        Py_UCS4 c2 = PyUnicode_READ(lkind, ldata, margin);
+        if (c1 != c2 || !(c1 == ' ' || c1 == '\t')) {
+            break;
         }
+        margin++;
     }
 
-    assert(_len >= 0);
-    if (_len > 0) {
-        *output = _start;
-    }
-    return _len;
+    return margin;
 }
 
 /* Dedent a string.
@@ -14395,74 +14377,58 @@ search_longest_common_leading_whitespace(
 PyObject *
 _PyUnicode_Dedent(PyObject *unicode)
 {
-    Py_ssize_t src_len = 0;
-    const char *src = PyUnicode_AsUTF8AndSize(unicode, &src_len);
-    if (!src) {
+    PyObject *sep = PyUnicode_FromString("\n");
+    if (sep == NULL) {
         return NULL;
     }
-    assert(src_len >= 0);
-    if (src_len == 0) {
-        return Py_NewRef(unicode);
-    }
-
-    const char *const end = src + src_len;
-
-    // [whitespace_start, whitespace_start + whitespace_len)
-    // describes the current longest common leading whitespace
-    const char *whitespace_start = NULL;
-    Py_ssize_t whitespace_len = search_longest_common_leading_whitespace(
-        src, end, &whitespace_start);
-
-    if (whitespace_len == 0) {
-        return Py_NewRef(unicode);
+    PyObject *lines = PyUnicode_Split(unicode, sep, -1);
+    Py_DECREF(sep);
+    if (lines == NULL) {
+        return NULL;
     }
+    Py_ssize_t nlines = PyList_GET_SIZE(lines);
+    Py_ssize_t margin = search_longest_common_leading_whitespace(lines, nlines);
 
-    // now we should trigger a dedent
-    char *dest = PyMem_Malloc(src_len);
-    if (!dest) {
-        PyErr_NoMemory();
+    PyUnicodeWriter *writer = PyUnicodeWriter_Create(0);
+    if (writer == NULL) {
+        Py_DECREF(lines);
         return NULL;
     }
-    char *dest_iter = dest;
 
-    for (const char *iter = src; iter < end; ++iter) {
-        const char *line_start = iter;
-        bool in_leading_space = true;
+    for (Py_ssize_t i = 0; i < nlines; i++) {
+        PyObject *line = PyList_GET_ITEM(lines, i);
+        Py_ssize_t linelen = PyUnicode_GET_LENGTH(line);
 
-        // iterate over a line to find the end of a line
-        while (iter < end && *iter != '\n') {
-            if (in_leading_space && *iter != ' ' && *iter != '\t') {
-                in_leading_space = false;
+        int all_ws = 1;
+        int kind = PyUnicode_KIND(line);
+        void *data = PyUnicode_DATA(line);
+        for (Py_ssize_t j = 0; j < linelen; j++) {
+            if (!Py_UNICODE_ISSPACE(PyUnicode_READ(kind, data, j))) {
+                all_ws = 0;
+                break;
             }
-            ++iter;
         }
 
-        // invariant: *iter == '\n' or iter == end
-        bool append_newline = iter < end;
-
-        // if this line has all white space, write '\n' and continue
-        if (in_leading_space && append_newline) {
-            *dest_iter++ = '\n';
-            continue;
+        if (!all_ws) {
+            Py_ssize_t start = Py_MIN(margin, linelen);
+            if (PyUnicodeWriter_WriteSubstring(writer, line, start, linelen) < 0) {
+                PyUnicodeWriter_Discard(writer);
+                Py_DECREF(lines);
+                return NULL;
+            }
         }
 
-        /* copy [new_line_start + whitespace_len, iter) to buffer, then
-            conditionally append '\n' */
-
-        Py_ssize_t new_line_len = iter - line_start - whitespace_len;
-        assert(new_line_len >= 0);
-        memcpy(dest_iter, line_start + whitespace_len, new_line_len);
-
-        dest_iter += new_line_len;
-
-        if (append_newline) {
-            *dest_iter++ = '\n';
+        if (i < nlines - 1) {
+            if (PyUnicodeWriter_WriteChar(writer, '\n') < 0) {
+                PyUnicodeWriter_Discard(writer);
+                Py_DECREF(lines);
+                return NULL;
+            }
         }
     }
 
-    PyObject *res = PyUnicode_FromStringAndSize(dest, dest_iter - dest);
-    PyMem_Free(dest);
-    return res;
+    Py_DECREF(lines);
+    return PyUnicodeWriter_Finish(writer);
 }
 
 static PyMethodDef unicode_methods[] = {

Original file line number	Diff line number	Diff line change
`@@ -0,0 +1 @@`
	`1`	+:option:`-c` now dedents its command argument the same way as :func:`textwrap.dedent`.