Skip to content

Commit 40bcdea

Browse files
Commit
1 parent c919d02 commit 40bcdea

File tree

5 files changed

+185
-117
lines changed

5 files changed

+185
-117
lines changed

Include/internal/pycore_unicodeobject.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -261,7 +261,7 @@ extern Py_ssize_t _PyUnicode_InsertThousandsGrouping(
261261
Behaviour is expected to be an exact match of `textwrap.dedent`.
262262
Return a new reference on success, NULL with exception set on error.
263263
*/
264-
extern PyObject* _PyUnicode_Dedent(PyObject *unicode);
264+
PyAPI_FUNC(PyObject*) _PyUnicode_Dedent(PyObject *unicode);
265265

266266
/* --- Misc functions ----------------------------------------------------- */
267267

Lib/test/test_capi/test_unicode.py

Lines changed: 89 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1074,6 +1074,95 @@ def test_transform_decimal_and_space(self):
10741074
self.assertRaises(SystemError, transform_decimal, [])
10751075
# CRASHES transform_decimal(NULL)
10761076

1077+
@support.cpython_only
1078+
@unittest.skipIf(_testinternalcapi is None,'need _testinternalcapi module')
1079+
def test_dedent(self):
1080+
from _testinternalcapi import _PyUnicode_Dedent as dedent
1081+
self.assertEqual('hello\nworld', dedent(' hello\n world'))
1082+
self.assertEqual('hello\nmy\n friend', dedent(' hello\n my\n  friend'))
1083+
1084+
# Only spaces.
1085+
text = " "
1086+
expect = ""
1087+
self.assertEqual(expect, dedent(text))
1088+
1089+
# Only tabs.
1090+
text = "\t\t\t\t"
1091+
expect = ""
1092+
self.assertEqual(expect, dedent(text))
1093+
1094+
# A mixture.
1095+
text = " \t \t\t \t "
1096+
expect = ""
1097+
self.assertEqual(expect, dedent(text))
1098+
1099+
# ASCII whitespace.
1100+
text = "\f\n\r\t\v "
1101+
expect = "\n"
1102+
self.assertEqual(expect, dedent(text))
1103+
1104+
# One newline.
1105+
text = "\n"
1106+
expect = "\n"
1107+
self.assertEqual(expect, dedent(text))
1108+
1109+
# Windows-style newlines.
1110+
text = "\r\n" * 5
1111+
expect = "\n" * 5
1112+
self.assertEqual(expect, dedent(text))
1113+
1114+
# Whitespace mixture.
1115+
text = " \n\t\n \n\t\t\n\n\n "
1116+
expect = "\n\n\n\n\n\n"
1117+
self.assertEqual(expect, dedent(text))
1118+
1119+
# Lines consisting only of whitespace are always normalised
1120+
text = "a\n \n\t\n"
1121+
expect = "a\n\n\n"
1122+
self.assertEqual(expect, dedent(text))
1123+
1124+
# Whitespace characters on non-empty lines are retained
1125+
text = "a\r\n\r\n\r\n"
1126+
expect = "a\r\n\n\n"
1127+
self.assertEqual(expect, dedent(text))
1128+
1129+
# Uneven indentation with declining indent level.
1130+
text = "     Foo\n    Bar\n" # 5 spaces, then 4
1131+
expect = " Foo\nBar\n"
1132+
self.assertEqual(expect, dedent(text))
1133+
1134+
# Declining indent level with blank line.
1135+
text = "     Foo\n\n    Bar\n" # 5 spaces, blank, then 4
1136+
expect = " Foo\n\nBar\n"
1137+
self.assertEqual(expect, dedent(text))
1138+
1139+
# Declining indent level with whitespace only line.
1140+
text = "     Foo\n    \n    Bar\n" # 5 spaces, then 4, then 4
1141+
expect = " Foo\n\nBar\n"
1142+
self.assertEqual(expect, dedent(text))
1143+
1144+
text = " hello\tthere\n how are\tyou?"
1145+
expect = "hello\tthere\nhow are\tyou?"
1146+
self.assertEqual(expect, dedent(text))
1147+
1148+
# dedent() only removes whitespace that can be uniformly removed!
1149+
text = "\thello there\n\thow are you?"
1150+
expect = "hello there\nhow are you?"
1151+
self.assertEqual(expect, dedent(text))
1152+
1153+
text = '''\
1154+
    def foo():
1155+
        while 1:
1156+
            return foo
1157+
    '''
1158+
expect = '''\
1159+
def foo():
1160+
    while 1:
1161+
        return foo
1162+
'''
1163+
self.assertEqual(expect, dedent(text))
1164+
1165+
10771166
@support.cpython_only
10781167
@unittest.skipIf(_testlimitedcapi is None, 'need _testlimitedcapi module')
10791168
def test_concat(self):
Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
The :option:`-c` command-line option now dedents its argument in the same way as :func:`textwrap.dedent`.

Modules/_testinternalcapi.c

Lines changed: 13 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -34,7 +34,7 @@
3434
#include "pycore_pyerrors.h" // _PyErr_ChainExceptions1()
3535
#include "pycore_pylifecycle.h" // _PyInterpreterConfig_InitFromDict()
3636
#include "pycore_pystate.h" // _PyThreadState_GET()
37-
#include "pycore_unicodeobject.h" // _PyUnicode_TransformDecimalAndSpaceToASCII()
37+
#include "pycore_unicodeobject.h" // _PyUnicode_TransformDecimalAndSpaceToASCII() / _PyUnicode_Dedent()
3838

3939
#include "clinic/_testinternalcapi.c.h"
4040

@@ -1416,6 +1416,17 @@ unicode_transformdecimalandspacetoascii(PyObject *self, PyObject *arg)
14161416
return _PyUnicode_TransformDecimalAndSpaceToASCII(arg);
14171417
}
14181418

1419+
/* Test _PyUnicode_Dedent() */
1420+
static PyObject *
1421+
unicode_dedent(PyObject *self, PyObject *arg)
1422+
{
1423+
if (arg == Py_None) {
1424+
arg = NULL;
1425+
}
1426+
return _PyUnicode_Dedent(arg);
1427+
}
1428+
1429+
14191430
static PyObject *
14201431
test_pyobject_is_freed(const char *test_name, PyObject *op)
14211432
{
@@ -2422,6 +2433,7 @@ static PyMethodDef module_functions[] = {
24222433
{"_PyTraceMalloc_GetTraceback", tracemalloc_get_traceback, METH_VARARGS},
24232434
{"test_tstate_capi", test_tstate_capi, METH_NOARGS, NULL},
24242435
{"_PyUnicode_TransformDecimalAndSpaceToASCII", unicode_transformdecimalandspacetoascii, METH_O},
2436+
{"_PyUnicode_Dedent", unicode_dedent, METH_O},
24252437
{"check_pyobject_forbidden_bytes_is_freed",
24262438
check_pyobject_forbidden_bytes_is_freed, METH_NOARGS},
24272439
{"check_pyobject_freed_is_freed", check_pyobject_freed_is_freed, METH_NOARGS},

Objects/unicodeobject.c

Lines changed: 81 additions & 115 deletions
Original file line numberDiff line numberDiff line change
@@ -14309,83 +14309,65 @@ unicode_getnewargs(PyObject *v, PyObject *Py_UNUSED(ignored))
1430914309
}
1431014310

1431114311
/*
14312-
This function searchs the longest common leading whitespace
14313-
of all lines in the [src, end).
14314-
It returns the length of the common leading whitespace and sets `output` to
14315-
point to the beginning of the common leading whitespace if length > 0.
14312+
Find the longest common leading whitespace among a list of lines.
14313+
Whitespace-only lines are ignored.
14314+
Returns the margin length (>= 0).
1431614315
*/
1431714316
static Py_ssize_t
14318-
search_longest_common_leading_whitespace(
14319-
const char *const src,
14320-
const char *const end,
14321-
const char **output)
14322-
{
14323-
// [_start, _start + _len)
14324-
// describes the current longest common leading whitespace
14325-
const char *_start = NULL;
14326-
Py_ssize_t _len = 0;
14327-
14328-
for (const char *iter = src; iter < end; ++iter) {
14329-
const char *line_start = iter;
14330-
const char *leading_whitespace_end = NULL;
14331-
14332-
// scan the whole line
14333-
while (iter < end && *iter != '\n') {
14334-
if (!leading_whitespace_end && *iter != ' ' && *iter != '\t') {
14335-
/* `iter` points to the first non-whitespace character
14336-
in this line */
14337-
if (iter == line_start) {
14338-
// some line has no indent, fast exit!
14339-
return 0;
14340-
}
14341-
leading_whitespace_end = iter;
14342-
}
14343-
++iter;
14344-
}
14317+
search_longest_common_leading_whitespace(PyObject *lines, Py_ssize_t nlines)
14318+
{
14319+
PyObject *smallest = NULL, *largest = NULL;
14320+
for (Py_ssize_t i = 0; i < nlines; i++) {
14321+
PyObject *line = PyList_GET_ITEM(lines, i);
14322+
Py_ssize_t linelen = PyUnicode_GET_LENGTH(line);
1434514323

14346-
// if this line has all white space, skip it
14347-
if (!leading_whitespace_end) {
14324+
if (linelen == 0) {
1434814325
continue;
1434914326
}
1435014327

14351-
if (!_start) {
14352-
// update the first leading whitespace
14353-
_start = line_start;
14354-
_len = leading_whitespace_end - line_start;
14355-
assert(_len > 0);
14328+
int kind = PyUnicode_KIND(line);
14329+
void *data = PyUnicode_DATA(line);
14330+
int all_ws = 1;
14331+
for (Py_ssize_t j = 0; j < linelen; j++) {
14332+
if (!Py_UNICODE_ISSPACE(PyUnicode_READ(kind, data, j))) {
14333+
all_ws = 0;
14334+
break;
14335+
}
14336+
}
14337+
if (all_ws) {
14338+
continue;
1435614339
}
14357-
else {
14358-
/* We then compare with the current longest leading whitespace.
1435914340

14360-
[line_start, leading_whitespace_end) is the leading
14361-
whitespace of this line,
14341+
if (smallest == NULL || PyObject_RichCompareBool(line, smallest, Py_LT)) {
14342+
smallest = line;
14343+
}
14344+
if (largest == NULL || PyObject_RichCompareBool(line, largest, Py_GT)) {
14345+
largest = line;
14346+
}
14347+
}
1436214348

14363-
[_start, _start + _len) is the leading whitespace of the
14364-
current longest leading whitespace. */
14365-
Py_ssize_t new_len = 0;
14366-
const char *_iter = _start, *line_iter = line_start;
14349+
if (smallest == NULL || largest == NULL) {
14350+
return 0;
14351+
}
1436714352

14368-
while (_iter < _start + _len && line_iter < leading_whitespace_end
14369-
&& *_iter == *line_iter)
14370-
{
14371-
++_iter;
14372-
++line_iter;
14373-
++new_len;
14374-
}
14353+
Py_ssize_t margin = 0;
14354+
Py_ssize_t minlen = Py_MIN(PyUnicode_GET_LENGTH(smallest),
14355+
PyUnicode_GET_LENGTH(largest));
14356+
int skind = PyUnicode_KIND(smallest);
14357+
int lkind = PyUnicode_KIND(largest);
14358+
const void *sdata = PyUnicode_DATA(smallest);
14359+
const void *ldata = PyUnicode_DATA(largest);
1437514360

14376-
_len = new_len;
14377-
if (_len == 0) {
14378-
// No common things now, fast exit!
14379-
return 0;
14380-
}
14361+
while (margin < minlen) {
14362+
Py_UCS4 c1 = PyUnicode_READ(skind, sdata, margin);
14363+
Py_UCS4 c2 = PyUnicode_READ(lkind, ldata, margin);
14364+
if (c1 != c2 || !(c1 == ' ' || c1 == '\t')) {
14365+
break;
1438114366
}
14367+
margin++;
1438214368
}
1438314369

14384-
assert(_len >= 0);
14385-
if (_len > 0) {
14386-
*output = _start;
14387-
}
14388-
return _len;
14370+
return margin;
1438914371
}
1439014372

1439114373
/* Dedent a string.
@@ -14395,74 +14377,58 @@ search_longest_common_leading_whitespace(
1439514377
PyObject *
1439614378
_PyUnicode_Dedent(PyObject *unicode)
1439714379
{
14398-
Py_ssize_t src_len = 0;
14399-
const char *src = PyUnicode_AsUTF8AndSize(unicode, &src_len);
14400-
if (!src) {
14380+
PyObject *sep = PyUnicode_FromString("\n");
14381+
if (sep == NULL) {
1440114382
return NULL;
1440214383
}
14403-
assert(src_len >= 0);
14404-
if (src_len == 0) {
14405-
return Py_NewRef(unicode);
14406-
}
14407-
14408-
const char *const end = src + src_len;
14409-
14410-
// [whitespace_start, whitespace_start + whitespace_len)
14411-
// describes the current longest common leading whitespace
14412-
const char *whitespace_start = NULL;
14413-
Py_ssize_t whitespace_len = search_longest_common_leading_whitespace(
14414-
src, end, &whitespace_start);
14415-
14416-
if (whitespace_len == 0) {
14417-
return Py_NewRef(unicode);
14384+
PyObject *lines = PyUnicode_Split(unicode, sep, -1);
14385+
Py_DECREF(sep);
14386+
if (lines == NULL) {
14387+
return NULL;
1441814388
}
14389+
Py_ssize_t nlines = PyList_GET_SIZE(lines);
14390+
Py_ssize_t margin = search_longest_common_leading_whitespace(lines, nlines);
1441914391

14420-
// now we should trigger a dedent
14421-
char *dest = PyMem_Malloc(src_len);
14422-
if (!dest) {
14423-
PyErr_NoMemory();
14392+
PyUnicodeWriter *writer = PyUnicodeWriter_Create(0);
14393+
if (writer == NULL) {
14394+
Py_DECREF(lines);
1442414395
return NULL;
1442514396
}
14426-
char *dest_iter = dest;
1442714397

14428-
for (const char *iter = src; iter < end; ++iter) {
14429-
const char *line_start = iter;
14430-
bool in_leading_space = true;
14398+
for (Py_ssize_t i = 0; i < nlines; i++) {
14399+
PyObject *line = PyList_GET_ITEM(lines, i);
14400+
Py_ssize_t linelen = PyUnicode_GET_LENGTH(line);
1443114401

14432-
// iterate over a line to find the end of a line
14433-
while (iter < end && *iter != '\n') {
14434-
if (in_leading_space && *iter != ' ' && *iter != '\t') {
14435-
in_leading_space = false;
14402+
int all_ws = 1;
14403+
int kind = PyUnicode_KIND(line);
14404+
void *data = PyUnicode_DATA(line);
14405+
for (Py_ssize_t j = 0; j < linelen; j++) {
14406+
if (!Py_UNICODE_ISSPACE(PyUnicode_READ(kind, data, j))) {
14407+
all_ws = 0;
14408+
break;
1443614409
}
14437-
++iter;
1443814410
}
1443914411

14440-
// invariant: *iter == '\n' or iter == end
14441-
bool append_newline = iter < end;
14442-
14443-
// if this line has all white space, write '\n' and continue
14444-
if (in_leading_space && append_newline) {
14445-
*dest_iter++ = '\n';
14446-
continue;
14412+
if (!all_ws) {
14413+
Py_ssize_t start = Py_MIN(margin, linelen);
14414+
if (PyUnicodeWriter_WriteSubstring(writer, line, start, linelen) < 0) {
14415+
PyUnicodeWriter_Discard(writer);
14416+
Py_DECREF(lines);
14417+
return NULL;
14418+
}
1444714419
}
1444814420

14449-
/* copy [new_line_start + whitespace_len, iter) to buffer, then
14450-
conditionally append '\n' */
14451-
14452-
Py_ssize_t new_line_len = iter - line_start - whitespace_len;
14453-
assert(new_line_len >= 0);
14454-
memcpy(dest_iter, line_start + whitespace_len, new_line_len);
14455-
14456-
dest_iter += new_line_len;
14457-
14458-
if (append_newline) {
14459-
*dest_iter++ = '\n';
14421+
if (i < nlines - 1) {
14422+
if (PyUnicodeWriter_WriteChar(writer, '\n') < 0) {
14423+
PyUnicodeWriter_Discard(writer);
14424+
Py_DECREF(lines);
14425+
return NULL;
14426+
}
1446014427
}
1446114428
}
1446214429

14463-
PyObject *res = PyUnicode_FromStringAndSize(dest, dest_iter - dest);
14464-
PyMem_Free(dest);
14465-
return res;
14430+
Py_DECREF(lines);
14431+
return PyUnicodeWriter_Finish(writer);
1446614432
}
1446714433

1446814434
static PyMethodDef unicode_methods[] = {

0 commit comments

Comments
 (0)