Skip to content

Commit f6ace9d

Browse files
Add comment & un-refactor
1 parent 40bcdea commit f6ace9d

File tree

2 files changed

+114
-83
lines changed

2 files changed

+114
-83
lines changed

Include/internal/pycore_unicodeobject.h

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -259,7 +259,9 @@ extern Py_ssize_t _PyUnicode_InsertThousandsGrouping(
259259

260260
/* Dedent a string.
261261
Behaviour is expected to be an exact match of `textwrap.dedent`.
262-
Return a new reference on success, NULL with exception set on error.
262+
Return a new reference on success, NULL with an exception set on error.
263+
264+
Export for test_capi.test_unicode
263265
*/
264266
PyAPI_FUNC(PyObject*) _PyUnicode_Dedent(PyObject *unicode);
265267

Objects/unicodeobject.c

Lines changed: 111 additions & 82 deletions
Original file line numberDiff line numberDiff line change
@@ -14309,126 +14309,155 @@ unicode_getnewargs(PyObject *v, PyObject *Py_UNUSED(ignored))
1430914309
}
1431014310

1431114311
/*
14312-
Find the longest common leading whitespace among a list of lines.
14313-
Whitespace-only lines are ignored.
14314-
Returns the margin length (>= 0).
14312+
This function searches for the longest common leading whitespace
14313+
of all lines in the range [src, end).
14314+
It returns the length of the common leading whitespace and sets *output* to
14315+
point to the beginning of the common leading whitespace if length > 0.
1431514316
*/
1431614317
static Py_ssize_t
14317-
search_longest_common_leading_whitespace(PyObject *lines, Py_ssize_t nlines)
14318-
{
14319-
PyObject *smallest = NULL, *largest = NULL;
14320-
for (Py_ssize_t i = 0; i < nlines; i++) {
14321-
PyObject *line = PyList_GET_ITEM(lines, i);
14322-
Py_ssize_t linelen = PyUnicode_GET_LENGTH(line);
14323-
14324-
if (linelen == 0) {
14325-
continue;
14326-
}
14327-
14328-
int kind = PyUnicode_KIND(line);
14329-
void *data = PyUnicode_DATA(line);
14330-
int all_ws = 1;
14331-
for (Py_ssize_t j = 0; j < linelen; j++) {
14332-
if (!Py_UNICODE_ISSPACE(PyUnicode_READ(kind, data, j))) {
14333-
all_ws = 0;
14334-
break;
14318+
search_longest_common_leading_whitespace(
14319+
const char *const src,
14320+
const char *const end,
14321+
const char **output)
14322+
{
14323+
// [_start, _start + _len)
14324+
// describes the current longest common leading whitespace
14325+
const char *_start = NULL;
14326+
Py_ssize_t _len = 0;
14327+
14328+
for (const char *iter = src; iter < end; ++iter) {
14329+
const char *line_start = iter;
14330+
const char *leading_whitespace_end = NULL;
14331+
14332+
// scan the whole line
14333+
while (iter < end && *iter != '\n') {
14334+
if (!leading_whitespace_end && !Py_ISSPACE(Py_CHARMASK(*iter))) {
14335+
if (iter == line_start) {
14336+
// some line has no indent, fast exit!
14337+
return 0;
14338+
}
14339+
leading_whitespace_end = iter;
1433514340
}
14341+
++iter;
1433614342
}
14337-
if (all_ws) {
14343+
14344+
// if this line is all whitespace, skip it
14345+
if (!leading_whitespace_end) {
1433814346
continue;
1433914347
}
1434014348

14341-
if (smallest == NULL || PyObject_RichCompareBool(line, smallest, Py_LT)) {
14342-
smallest = line;
14349+
if (!_start) {
14350+
// update the first leading whitespace
14351+
_start = line_start;
14352+
_len = leading_whitespace_end - line_start;
14353+
assert(_len > 0);
1434314354
}
14344-
if (largest == NULL || PyObject_RichCompareBool(line, largest, Py_GT)) {
14345-
largest = line;
14346-
}
14347-
}
14355+
else {
14356+
/* We then compare with the current longest leading whitespace.
1434814357
14349-
if (smallest == NULL || largest == NULL) {
14350-
return 0;
14351-
}
14358+
[line_start, leading_whitespace_end) is the leading
14359+
whitespace of this line,
1435214360
14353-
Py_ssize_t margin = 0;
14354-
Py_ssize_t minlen = Py_MIN(PyUnicode_GET_LENGTH(smallest),
14355-
PyUnicode_GET_LENGTH(largest));
14356-
int skind = PyUnicode_KIND(smallest);
14357-
int lkind = PyUnicode_KIND(largest);
14358-
const void *sdata = PyUnicode_DATA(smallest);
14359-
const void *ldata = PyUnicode_DATA(largest);
14361+
[_start, _start + _len) is the current longest common
14362+
leading whitespace. */
14363+
Py_ssize_t new_len = 0;
14364+
const char *_iter = _start, *line_iter = line_start;
1436014365

14361-
while (margin < minlen) {
14362-
Py_UCS4 c1 = PyUnicode_READ(skind, sdata, margin);
14363-
Py_UCS4 c2 = PyUnicode_READ(lkind, ldata, margin);
14364-
if (c1 != c2 || !(c1 == ' ' || c1 == '\t')) {
14365-
break;
14366+
while (_iter < _start + _len && line_iter < leading_whitespace_end
14367+
&& *_iter == *line_iter)
14368+
{
14369+
++_iter;
14370+
++line_iter;
14371+
++new_len;
14372+
}
14373+
14374+
_len = new_len;
14375+
if (_len == 0) {
14376+
// No common leading whitespace remains, fast exit!
14377+
return 0;
14378+
}
1436614379
}
14367-
margin++;
1436814380
}
1436914381

14370-
return margin;
14382+
assert(_len >= 0);
14383+
if (_len > 0) {
14384+
*output = _start;
14385+
}
14386+
return _len;
1437114387
}
1437214388

1437314389
/* Dedent a string.
14374-
Behaviour is expected to be an exact match of `textwrap.dedent`.
14375-
Return a new reference on success, NULL with exception set on error.
14390+
Behaviour is expected to be an exact match of textwrap.dedent.
14391+
Return a new reference on success, NULL with an exception set on error.
1437614392
*/
1437714393
PyObject *
1437814394
_PyUnicode_Dedent(PyObject *unicode)
1437914395
{
14380-
PyObject *sep = PyUnicode_FromString("\n");
14381-
if (sep == NULL) {
14396+
Py_ssize_t src_len = 0;
14397+
const char *src = PyUnicode_AsUTF8AndSize(unicode, &src_len);
14398+
if (!src) {
1438214399
return NULL;
1438314400
}
14384-
PyObject *lines = PyUnicode_Split(unicode, sep, -1);
14385-
Py_DECREF(sep);
14386-
if (lines == NULL) {
14387-
return NULL;
14401+
assert(src_len >= 0);
14402+
if (src_len == 0) {
14403+
return Py_NewRef(unicode);
1438814404
}
14389-
Py_ssize_t nlines = PyList_GET_SIZE(lines);
14390-
Py_ssize_t margin = search_longest_common_leading_whitespace(lines, nlines);
1439114405

14392-
PyUnicodeWriter *writer = PyUnicodeWriter_Create(0);
14393-
if (writer == NULL) {
14394-
Py_DECREF(lines);
14406+
const char *const end = src + src_len;
14407+
14408+
// [whitespace_start, whitespace_start + whitespace_len)
14409+
// describes the current longest common leading whitespace
14410+
const char *whitespace_start = NULL;
14411+
const Py_ssize_t whitespace_len = search_longest_common_leading_whitespace(
14412+
src, end, &whitespace_start);
14413+
14414+
// now we should trigger a dedent
14415+
char *dest = PyMem_Malloc(src_len);
14416+
if (!dest) {
14417+
PyErr_NoMemory();
1439514418
return NULL;
1439614419
}
14420+
char *dest_iter = dest;
1439714421

14398-
for (Py_ssize_t i = 0; i < nlines; i++) {
14399-
PyObject *line = PyList_GET_ITEM(lines, i);
14400-
Py_ssize_t linelen = PyUnicode_GET_LENGTH(line);
14422+
for (const char *iter = src; iter < end; ++iter) {
14423+
const char *line_start = iter;
14424+
bool in_leading_space = true;
1440114425

14402-
int all_ws = 1;
14403-
int kind = PyUnicode_KIND(line);
14404-
void *data = PyUnicode_DATA(line);
14405-
for (Py_ssize_t j = 0; j < linelen; j++) {
14406-
if (!Py_UNICODE_ISSPACE(PyUnicode_READ(kind, data, j))) {
14407-
all_ws = 0;
14408-
break;
14426+
// iterate over a line to find the end of a line
14427+
while (iter < end && *iter != '\n') {
14428+
if (in_leading_space && !Py_ISSPACE(Py_CHARMASK(*iter))) {
14429+
in_leading_space = false;
1440914430
}
14431+
++iter;
1441014432
}
1441114433

14412-
if (!all_ws) {
14413-
Py_ssize_t start = Py_MIN(margin, linelen);
14414-
if (PyUnicodeWriter_WriteSubstring(writer, line, start, linelen) < 0) {
14415-
PyUnicodeWriter_Discard(writer);
14416-
Py_DECREF(lines);
14417-
return NULL;
14434+
// invariant: *iter == '\n' or iter == end
14435+
const bool append_newline = iter < end;
14436+
14437+
// if this line is all whitespace, write '\n' and continue
14438+
if (in_leading_space) {
14439+
if (append_newline) {
14440+
*dest_iter++ = '\n';
1441814441
}
14442+
continue;
1441914443
}
1442014444

14421-
if (i < nlines - 1) {
14422-
if (PyUnicodeWriter_WriteChar(writer, '\n') < 0) {
14423-
PyUnicodeWriter_Discard(writer);
14424-
Py_DECREF(lines);
14425-
return NULL;
14426-
}
14445+
/* copy [line_start + whitespace_len, iter) to buffer, then
14446+
conditionally append '\n' */
14447+
const Py_ssize_t new_line_len = iter - line_start - whitespace_len;
14448+
assert(new_line_len >= 0);
14449+
memcpy(dest_iter, line_start + whitespace_len, new_line_len);
14450+
14451+
dest_iter += new_line_len;
14452+
14453+
if (append_newline) {
14454+
*dest_iter++ = '\n';
1442714455
}
1442814456
}
1442914457

14430-
Py_DECREF(lines);
14431-
return PyUnicodeWriter_Finish(writer);
14458+
PyObject *res = PyUnicode_FromStringAndSize(dest, dest_iter - dest);
14459+
PyMem_Free(dest);
14460+
return res;
1443214461
}
1443314462

1443414463
static PyMethodDef unicode_methods[] = {

0 commit comments

Comments
 (0)