Refactor implementation

sunmy2019 · sunmy2019 · commit ed6e17bdd479 · 2025-04-18T11:30:22.000+08:00
diff --git a/Include/internal/pycore_unicodeobject.h b/Include/internal/pycore_unicodeobject.h
@@ -251,7 +251,7 @@ extern Py_ssize_t _PyUnicode_InsertThousandsGrouping(
    Behaviour is expected to be an exact match of `textwrap.dedent`.
    Return a new reference on success, NULL with exception set on error.
    */
-PyAPI_FUNC(PyObject*) _PyUnicode_Dedent(PyObject *unicode);
+extern PyObject* _PyUnicode_Dedent(PyObject *unicode);
 
 /* --- Misc functions ----------------------------------------------------- */
 
diff --git a/Lib/test/test_cmd_line.py b/Lib/test/test_cmd_line.py
@@ -17,6 +17,8 @@
     spawn_python, kill_python, assert_python_ok, assert_python_failure,
     interpreter_requires_environment
 )
+from textwrap import dedent
+
 
 if not support.has_subprocess_support:
     raise unittest.SkipTest("test module requires subprocess")
@@ -1053,7 +1055,6 @@ def test_int_max_str_digits(self):
 
     def test_cmd_dedent(self):
         # test that -c auto-dedents its arguments
-        from textwrap import dedent
         test_cases = [
             (
                 """
@@ -1096,6 +1097,14 @@ def test_cmd_dedent(self):
                 # textwrap.dedent behavior, but might not be intuitive.
                 "'\\n\\nthis data has an empty newline above and a newline with spaces below \\n\\n'",
             ),
+            (
+                '',
+                '',
+            ),
+            (
+                '  \t\n\t\n \t\t\t  \t\t \t\n\t\t \n\n\n\t\t\t   ',
+                '',
+            ),
         ]
         for code, expected in test_cases:
             # Run the auto-dedent case
diff --git a/Objects/unicodeobject.c b/Objects/unicodeobject.c
@@ -14270,29 +14270,22 @@ unicode_getnewargs(PyObject *v, PyObject *Py_UNUSED(ignored))
     return Py_BuildValue("(N)", copy);
 }
 
-/* Dedent a string.
-   Behaviour is expected to be an exact match of `textwrap.dedent`.
-   Return a new reference on success, NULL with exception set on error.
-   */
-PyAPI_FUNC(PyObject *)
-_PyUnicode_Dedent(PyObject *unicode)
-{
-    Py_ssize_t src_len = 0;
-    const char *src = PyUnicode_AsUTF8AndSize(unicode, &src_len);
-    if (!src) {
-        return NULL;
-    }
-    if (src_len <= 0) {
-        Py_INCREF(unicode);
-        return unicode;
-    }
-
-    const char *end = src + src_len;
-
-    // [candidate_start, candidate_start + candidate_len)
+/*
+This function searches for the longest common leading whitespace
+of all lines in [src, end).
+It returns the length of the common leading whitespace and sets `output` to
+point to the beginning of the common leading whitespace if length > 0.
+*/
+static Py_ssize_t
+search_longest_common_leading_whitespace(
+    const char * const src,
+    const char * const end,
+    const char * * output
+) {
+    // [_start, _start + _len)
     // describes the current longest common leading whitespace
-    const char *candidate_start = NULL;
-    Py_ssize_t candidate_len = 0;
+    const char *_start = NULL;
+    Py_ssize_t _len = 0;
 
     for (const char *iter = src; iter < end; ++iter) {
         const char *line_start = iter;
@@ -14305,8 +14298,7 @@ _PyUnicode_Dedent(PyObject *unicode)
                    in this line */
                 if (iter == line_start) {
                     // some line has no indent, fast exit!
-                    Py_INCREF(unicode);
-                    return unicode;
+                    return 0;
                 }
                 leading_whitespace_end = iter;
             }
@@ -14318,47 +14310,73 @@ _PyUnicode_Dedent(PyObject *unicode)
             continue;
         }
 
-        if (!candidate_start) {
+        if (!_start) {
             // update the first leading whitespace
-            candidate_start = line_start;
-            candidate_len = leading_whitespace_end - line_start;
-            assert(candidate_len > 0);
-        } else {
+            _start = line_start;
+            _len = leading_whitespace_end - line_start;
+            assert(_len > 0);
+        }
+        else {
             /* We then compare with the current longest leading whitespace.
 
-               [line_start, leading_whitespace_end) is the leading whitespace of
-               this line,
+               [line_start, leading_whitespace_end) is the leading
+               whitespace of this line,
 
-               [candidate_start, candidate_start + candidate_len)
-               is the leading whitespace of the current longest leading
-               whitespace. */
-            Py_ssize_t new_candidate_len = 0;
+               [_start, _start + _len) is the current longest common
+               leading whitespace. */
+            Py_ssize_t new_len = 0;
+            const char *_iter = _start, *line_iter = line_start;
 
-            for (const char *candidate_iter = candidate_start,
-                            *line_iter = line_start;
-                 candidate_iter < candidate_start + candidate_len &&
-                 line_iter < leading_whitespace_end;
-                 ++candidate_iter, ++line_iter) {
-                if (*candidate_iter != *line_iter) {
-                    break;
-                }
-                ++new_candidate_len;
+            while (_iter < _start + _len && line_iter < leading_whitespace_end
+                && *_iter == *line_iter)
+            {
+                ++_iter;
+                ++line_iter;
+                ++new_len;
             }
 
-            candidate_len = new_candidate_len;
-            if (candidate_len == 0) {
+            _len = new_len;
+            if (_len == 0) {
                 // No common things now, fast exit!
-                Py_INCREF(unicode);
-                return unicode;
+                return 0;
             }
         }
     }
 
-    assert(candidate_len >= 0);
-    /* Final check for strings that contain nothing but whitespace. */
-    if (candidate_len == 0) {
-        Py_INCREF(unicode);
-        return unicode;
+    assert(_len >= 0);
+    if (_len > 0) {
+        *output = _start;
+    }
+    return _len;
+}
+
+/* Dedent a string.
+   Behaviour is expected to be an exact match of `textwrap.dedent`.
+   Return a new reference on success, NULL with exception set on error.
+   */
+PyObject *
+_PyUnicode_Dedent(PyObject *unicode)
+{
+    Py_ssize_t src_len = 0;
+    const char *src = PyUnicode_AsUTF8AndSize(unicode, &src_len);
+    if (!src) {
+        return NULL;
+    }
+    assert(src_len >= 0);
+    if (src_len == 0) {
+        return Py_NewRef(unicode);
+    }
+
+    const char *const end = src + src_len;
+
+    // [whitespace_start, whitespace_start + whitespace_len)
+    // describes the longest common leading whitespace of all lines
+    const char *whitespace_start = NULL;
+    Py_ssize_t whitespace_len = search_longest_common_leading_whitespace(
+        src, end, &whitespace_start);
+
+    if (whitespace_len == 0) {
+        return Py_NewRef(unicode);
     }
 
     // now we should trigger a dedent
@@ -14390,12 +14408,12 @@ _PyUnicode_Dedent(PyObject *unicode)
             continue;
         }
 
-        /* copy [new_line_start + candidate_len, iter) to buffer, then
+        /* copy [line_start + whitespace_len, iter) to buffer, then
             conditionally append '\n' */
 
-        Py_ssize_t new_line_len = iter - line_start - candidate_len;
+        Py_ssize_t new_line_len = iter - line_start - whitespace_len;
         assert(new_line_len >= 0);
-        memcpy(dest_iter, line_start + candidate_len, new_line_len);
+        memcpy(dest_iter, line_start + whitespace_len, new_line_len);
 
         dest_iter += new_line_len;