Skip to content
Open
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion Include/internal/pycore_unicodeobject.h
Original file line number Diff line number Diff line change
Expand Up @@ -261,7 +261,7 @@ extern Py_ssize_t _PyUnicode_InsertThousandsGrouping(
Behaviour is expected to be an exact match of `textwrap.dedent`.
Return a new reference on success, NULL with exception set on error.
*/
extern PyObject* _PyUnicode_Dedent(PyObject *unicode);
PyAPI_FUNC(PyObject*) _PyUnicode_Dedent(PyObject *unicode);

/* --- Misc functions ----------------------------------------------------- */

Expand Down
89 changes: 89 additions & 0 deletions Lib/test/test_capi/test_unicode.py
Original file line number Diff line number Diff line change
Expand Up @@ -1074,6 +1074,95 @@ def test_transform_decimal_and_space(self):
self.assertRaises(SystemError, transform_decimal, [])
# CRASHES transform_decimal(NULL)

@support.cpython_only
@unittest.skipIf(_testinternalcapi is None,'need _testinternalcapi module')
def test_dedent(self):
from _testinternalcapi import _PyUnicode_Dedent as dedent
self.assertEqual('hello\nworld', dedent(' hello\n world'))
self.assertEqual('hello\nmy\n friend', dedent(' hello\n my\n friend'))

# Only spaces.
text = " "
expect = ""
self.assertEqual(expect, dedent(text))

# Only tabs.
text = "\t\t\t\t"
expect = ""
self.assertEqual(expect, dedent(text))

# A mixture.
text = " \t \t\t \t "
expect = ""
self.assertEqual(expect, dedent(text))

# ASCII whitespace.
text = "\f\n\r\t\v "
expect = "\n"
self.assertEqual(expect, dedent(text))

# One newline.
text = "\n"
expect = "\n"
self.assertEqual(expect, dedent(text))

# Windows-style newlines.
text = "\r\n" * 5
expect = "\n" * 5
self.assertEqual(expect, dedent(text))

# Whitespace mixture.
text = " \n\t\n \n\t\t\n\n\n "
expect = "\n\n\n\n\n\n"
self.assertEqual(expect, dedent(text))

# Lines consisting only of whitespace are always normalised
text = "a\n \n\t\n"
expect = "a\n\n\n"
self.assertEqual(expect, dedent(text))

# Whitespace characters on non-empty lines are retained
text = "a\r\n\r\n\r\n"
expect = "a\r\n\n\n"
self.assertEqual(expect, dedent(text))

# Uneven indentation with declining indent level.
text = " Foo\n Bar\n" # 5 spaces, then 4
expect = " Foo\nBar\n"
self.assertEqual(expect, dedent(text))

# Declining indent level with blank line.
text = " Foo\n\n Bar\n" # 5 spaces, blank, then 4
expect = " Foo\n\nBar\n"
self.assertEqual(expect, dedent(text))

# Declining indent level with whitespace only line.
text = " Foo\n \n Bar\n" # 5 spaces, then 4, then 4
expect = " Foo\n\nBar\n"
self.assertEqual(expect, dedent(text))

text = " hello\tthere\n how are\tyou?"
expect = "hello\tthere\nhow are\tyou?"
self.assertEqual(expect, dedent(text))

# dedent() only removes whitespace that can be uniformly removed!
text = "\thello there\n\thow are you?"
expect = "hello there\nhow are you?"
self.assertEqual(expect, dedent(text))

text = '''\
def foo():
while 1:
return foo
'''
expect = '''\
def foo():
while 1:
return foo
'''
self.assertEqual(expect, dedent(text))


@support.cpython_only
@unittest.skipIf(_testlimitedcapi is None, 'need _testlimitedcapi module')
def test_concat(self):
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
:option:`-c` now dedents like :func:`textwrap.dedent`
14 changes: 13 additions & 1 deletion Modules/_testinternalcapi.c
Original file line number Diff line number Diff line change
Expand Up @@ -34,7 +34,7 @@
#include "pycore_pyerrors.h" // _PyErr_ChainExceptions1()
#include "pycore_pylifecycle.h" // _PyInterpreterConfig_InitFromDict()
#include "pycore_pystate.h" // _PyThreadState_GET()
#include "pycore_unicodeobject.h" // _PyUnicode_TransformDecimalAndSpaceToASCII()
#include "pycore_unicodeobject.h" // _PyUnicode_TransformDecimalAndSpaceToASCII() / _PyUnicode_Dedent()

#include "clinic/_testinternalcapi.c.h"

Expand Down Expand Up @@ -1416,6 +1416,17 @@ unicode_transformdecimalandspacetoascii(PyObject *self, PyObject *arg)
return _PyUnicode_TransformDecimalAndSpaceToASCII(arg);
}

/* Test _PyUnicode_Dedent() */
static PyObject *
unicode_dedent(PyObject *self, PyObject *arg)
{
if (arg == Py_None) {
arg = NULL;
}
return _PyUnicode_Dedent(arg);
}


static PyObject *
test_pyobject_is_freed(const char *test_name, PyObject *op)
{
Expand Down Expand Up @@ -2422,6 +2433,7 @@ static PyMethodDef module_functions[] = {
{"_PyTraceMalloc_GetTraceback", tracemalloc_get_traceback, METH_VARARGS},
{"test_tstate_capi", test_tstate_capi, METH_NOARGS, NULL},
{"_PyUnicode_TransformDecimalAndSpaceToASCII", unicode_transformdecimalandspacetoascii, METH_O},
{"_PyUnicode_Dedent", unicode_dedent, METH_O},
{"check_pyobject_forbidden_bytes_is_freed",
check_pyobject_forbidden_bytes_is_freed, METH_NOARGS},
{"check_pyobject_freed_is_freed", check_pyobject_freed_is_freed, METH_NOARGS},
Expand Down
196 changes: 81 additions & 115 deletions Objects/unicodeobject.c
Original file line number Diff line number Diff line change
Expand Up @@ -14309,83 +14309,65 @@ unicode_getnewargs(PyObject *v, PyObject *Py_UNUSED(ignored))
}

/*
This function searchs the longest common leading whitespace
of all lines in the [src, end).
It returns the length of the common leading whitespace and sets `output` to
point to the beginning of the common leading whitespace if length > 0.
Find the longest common leading whitespace among a list of lines.
Whitespace-only lines are ignored.
Returns the margin length (>= 0).
*/
static Py_ssize_t
search_longest_common_leading_whitespace(
const char *const src,
const char *const end,
const char **output)
{
// [_start, _start + _len)
// describes the current longest common leading whitespace
const char *_start = NULL;
Py_ssize_t _len = 0;

for (const char *iter = src; iter < end; ++iter) {
const char *line_start = iter;
const char *leading_whitespace_end = NULL;

// scan the whole line
while (iter < end && *iter != '\n') {
if (!leading_whitespace_end && *iter != ' ' && *iter != '\t') {
/* `iter` points to the first non-whitespace character
in this line */
if (iter == line_start) {
// some line has no indent, fast exit!
return 0;
}
leading_whitespace_end = iter;
}
++iter;
}
search_longest_common_leading_whitespace(PyObject *lines, Py_ssize_t nlines)
{
PyObject *smallest = NULL, *largest = NULL;
for (Py_ssize_t i = 0; i < nlines; i++) {
PyObject *line = PyList_GET_ITEM(lines, i);
Py_ssize_t linelen = PyUnicode_GET_LENGTH(line);

// if this line has all white space, skip it
if (!leading_whitespace_end) {
if (linelen == 0) {
continue;
}

if (!_start) {
// update the first leading whitespace
_start = line_start;
_len = leading_whitespace_end - line_start;
assert(_len > 0);
int kind = PyUnicode_KIND(line);
void *data = PyUnicode_DATA(line);
int all_ws = 1;
for (Py_ssize_t j = 0; j < linelen; j++) {
if (!Py_UNICODE_ISSPACE(PyUnicode_READ(kind, data, j))) {
all_ws = 0;
break;
}
}
if (all_ws) {
continue;
}
else {
/* We then compare with the current longest leading whitespace.

[line_start, leading_whitespace_end) is the leading
whitespace of this line,
if (smallest == NULL || PyObject_RichCompareBool(line, smallest, Py_LT)) {
smallest = line;
}
if (largest == NULL || PyObject_RichCompareBool(line, largest, Py_GT)) {
largest = line;
}
}

[_start, _start + _len) is the leading whitespace of the
current longest leading whitespace. */
Py_ssize_t new_len = 0;
const char *_iter = _start, *line_iter = line_start;
if (smallest == NULL || largest == NULL) {
return 0;
}

while (_iter < _start + _len && line_iter < leading_whitespace_end
&& *_iter == *line_iter)
{
++_iter;
++line_iter;
++new_len;
}
Py_ssize_t margin = 0;
Py_ssize_t minlen = Py_MIN(PyUnicode_GET_LENGTH(smallest),
PyUnicode_GET_LENGTH(largest));
int skind = PyUnicode_KIND(smallest);
int lkind = PyUnicode_KIND(largest);
const void *sdata = PyUnicode_DATA(smallest);
const void *ldata = PyUnicode_DATA(largest);

_len = new_len;
if (_len == 0) {
// No common things now, fast exit!
return 0;
}
while (margin < minlen) {
Py_UCS4 c1 = PyUnicode_READ(skind, sdata, margin);
Py_UCS4 c2 = PyUnicode_READ(lkind, ldata, margin);
if (c1 != c2 || !(c1 == ' ' || c1 == '\t')) {
break;
}
margin++;
}

assert(_len >= 0);
if (_len > 0) {
*output = _start;
}
return _len;
return margin;
}

/* Dedent a string.
Expand All @@ -14395,74 +14377,58 @@ search_longest_common_leading_whitespace(
PyObject *
_PyUnicode_Dedent(PyObject *unicode)
{
Py_ssize_t src_len = 0;
const char *src = PyUnicode_AsUTF8AndSize(unicode, &src_len);
if (!src) {
PyObject *sep = PyUnicode_FromString("\n");
if (sep == NULL) {
return NULL;
}
assert(src_len >= 0);
if (src_len == 0) {
return Py_NewRef(unicode);
}

const char *const end = src + src_len;

// [whitespace_start, whitespace_start + whitespace_len)
// describes the current longest common leading whitespace
const char *whitespace_start = NULL;
Py_ssize_t whitespace_len = search_longest_common_leading_whitespace(
src, end, &whitespace_start);

if (whitespace_len == 0) {
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Keep the fast path.

Copy link
Member Author

@StanFromIreland StanFromIreland Sep 7, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We can't, we need to clear lines, see the tests.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Do we really need to do it? in general there is nothing to dedent, so it'll slow down normal cases. Even if the comment says that it's meant to match textwrap.dedent(), I don't think it's needed.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

If we want to respect the behavior of textwrap.dedent, as is noted in What's new 3.14, the docs, and the comments. Then yes.

Copy link
Member

@picnixz picnixz Sep 7, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

But I don't think we need to! It's an internal function. textwrap.dedent is implemented in pure Python.

Copy link
Member Author

@StanFromIreland StanFromIreland Sep 7, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

But our documentation says we do:

Whats New 3.14

The auto-dedentation behavior mirrors textwrap.dedent().

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

If you would rather just remove the false claims, we can do: https://github.com/python/cpython/compare/main...StanFromIreland:remove-misleading-notes?expand=1

Though I think this may cause confusion in the future.

Copy link
Member

@picnixz picnixz Sep 7, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The auto-dedentation behavior mirrors textwrap.dedent().

Yes, but we can just say that we don't normalize whitespaces and only consider spaces and tabs. No one should ever write spaces with other space-like characters. Let's just amend the NEWS. As for str.dedent(), it will need a PEP which still doesn't exist and the discussion seems stalled IMO.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes, I am working on the PEP :-)

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I opened #138620 with your alternative, though I still think fixing it is better.

return Py_NewRef(unicode);
PyObject *lines = PyUnicode_Split(unicode, sep, -1);
Py_DECREF(sep);
if (lines == NULL) {
return NULL;
}
Py_ssize_t nlines = PyList_GET_SIZE(lines);
Py_ssize_t margin = search_longest_common_leading_whitespace(lines, nlines);

// now we should trigger a dedent
char *dest = PyMem_Malloc(src_len);
if (!dest) {
PyErr_NoMemory();
PyUnicodeWriter *writer = PyUnicodeWriter_Create(0);
if (writer == NULL) {
Py_DECREF(lines);
return NULL;
}
char *dest_iter = dest;

for (const char *iter = src; iter < end; ++iter) {
const char *line_start = iter;
bool in_leading_space = true;
for (Py_ssize_t i = 0; i < nlines; i++) {
PyObject *line = PyList_GET_ITEM(lines, i);
Py_ssize_t linelen = PyUnicode_GET_LENGTH(line);

// iterate over a line to find the end of a line
while (iter < end && *iter != '\n') {
if (in_leading_space && *iter != ' ' && *iter != '\t') {
in_leading_space = false;
int all_ws = 1;
int kind = PyUnicode_KIND(line);
void *data = PyUnicode_DATA(line);
for (Py_ssize_t j = 0; j < linelen; j++) {
if (!Py_UNICODE_ISSPACE(PyUnicode_READ(kind, data, j))) {
all_ws = 0;
break;
}
++iter;
}

// invariant: *iter == '\n' or iter == end
bool append_newline = iter < end;

// if this line has all white space, write '\n' and continue
if (in_leading_space && append_newline) {
*dest_iter++ = '\n';
continue;
if (!all_ws) {
Py_ssize_t start = Py_MIN(margin, linelen);
if (PyUnicodeWriter_WriteSubstring(writer, line, start, linelen) < 0) {
PyUnicodeWriter_Discard(writer);
Py_DECREF(lines);
return NULL;
}
}

/* copy [new_line_start + whitespace_len, iter) to buffer, then
conditionally append '\n' */

Py_ssize_t new_line_len = iter - line_start - whitespace_len;
assert(new_line_len >= 0);
memcpy(dest_iter, line_start + whitespace_len, new_line_len);

dest_iter += new_line_len;

if (append_newline) {
*dest_iter++ = '\n';
if (i < nlines - 1) {
if (PyUnicodeWriter_WriteChar(writer, '\n') < 0) {
PyUnicodeWriter_Discard(writer);
Py_DECREF(lines);
return NULL;
}
}
}

PyObject *res = PyUnicode_FromStringAndSize(dest, dest_iter - dest);
PyMem_Free(dest);
return res;
Py_DECREF(lines);
return PyUnicodeWriter_Finish(writer);
}

static PyMethodDef unicode_methods[] = {
Expand Down
Loading