-
-
Notifications
You must be signed in to change notification settings - Fork 33.3k
gh-103997: Fix and test _PyUnicode_Dedent
#138620
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Changes from 1 commit
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1 @@ | ||
| :option:`-c` now dedents like :func:`textwrap.dedent` |
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -14309,83 +14309,65 @@ unicode_getnewargs(PyObject *v, PyObject *Py_UNUSED(ignored)) | |
| } | ||
|
|
||
| /* | ||
| This function searchs the longest common leading whitespace | ||
| of all lines in the [src, end). | ||
| It returns the length of the common leading whitespace and sets `output` to | ||
| point to the beginning of the common leading whitespace if length > 0. | ||
| Find the longest common leading whitespace among a list of lines. | ||
| Whitespace-only lines are ignored. | ||
| Returns the margin length (>= 0). | ||
| */ | ||
| static Py_ssize_t | ||
| search_longest_common_leading_whitespace( | ||
| const char *const src, | ||
| const char *const end, | ||
| const char **output) | ||
| { | ||
| // [_start, _start + _len) | ||
| // describes the current longest common leading whitespace | ||
| const char *_start = NULL; | ||
| Py_ssize_t _len = 0; | ||
|
|
||
| for (const char *iter = src; iter < end; ++iter) { | ||
| const char *line_start = iter; | ||
| const char *leading_whitespace_end = NULL; | ||
|
|
||
| // scan the whole line | ||
| while (iter < end && *iter != '\n') { | ||
| if (!leading_whitespace_end && *iter != ' ' && *iter != '\t') { | ||
| /* `iter` points to the first non-whitespace character | ||
| in this line */ | ||
| if (iter == line_start) { | ||
| // some line has no indent, fast exit! | ||
| return 0; | ||
| } | ||
| leading_whitespace_end = iter; | ||
| } | ||
| ++iter; | ||
| } | ||
| search_longest_common_leading_whitespace(PyObject *lines, Py_ssize_t nlines) | ||
StanFromIreland marked this conversation as resolved.
Outdated
Show resolved
Hide resolved
|
||
| { | ||
| PyObject *smallest = NULL, *largest = NULL; | ||
| for (Py_ssize_t i = 0; i < nlines; i++) { | ||
| PyObject *line = PyList_GET_ITEM(lines, i); | ||
| Py_ssize_t linelen = PyUnicode_GET_LENGTH(line); | ||
|
|
||
| // if this line has all white space, skip it | ||
| if (!leading_whitespace_end) { | ||
| if (linelen == 0) { | ||
| continue; | ||
| } | ||
|
|
||
| if (!_start) { | ||
| // update the first leading whitespace | ||
| _start = line_start; | ||
| _len = leading_whitespace_end - line_start; | ||
| assert(_len > 0); | ||
| int kind = PyUnicode_KIND(line); | ||
| void *data = PyUnicode_DATA(line); | ||
| int all_ws = 1; | ||
| for (Py_ssize_t j = 0; j < linelen; j++) { | ||
| if (!Py_UNICODE_ISSPACE(PyUnicode_READ(kind, data, j))) { | ||
| all_ws = 0; | ||
| break; | ||
| } | ||
| } | ||
| if (all_ws) { | ||
| continue; | ||
| } | ||
| else { | ||
| /* We then compare with the current longest leading whitespace. | ||
|
|
||
| [line_start, leading_whitespace_end) is the leading | ||
| whitespace of this line, | ||
| if (smallest == NULL || PyObject_RichCompareBool(line, smallest, Py_LT)) { | ||
| smallest = line; | ||
| } | ||
| if (largest == NULL || PyObject_RichCompareBool(line, largest, Py_GT)) { | ||
| largest = line; | ||
| } | ||
| } | ||
|
|
||
| [_start, _start + _len) is the leading whitespace of the | ||
| current longest leading whitespace. */ | ||
| Py_ssize_t new_len = 0; | ||
| const char *_iter = _start, *line_iter = line_start; | ||
| if (smallest == NULL || largest == NULL) { | ||
| return 0; | ||
| } | ||
|
|
||
| while (_iter < _start + _len && line_iter < leading_whitespace_end | ||
| && *_iter == *line_iter) | ||
| { | ||
| ++_iter; | ||
| ++line_iter; | ||
| ++new_len; | ||
| } | ||
| Py_ssize_t margin = 0; | ||
| Py_ssize_t minlen = Py_MIN(PyUnicode_GET_LENGTH(smallest), | ||
| PyUnicode_GET_LENGTH(largest)); | ||
| int skind = PyUnicode_KIND(smallest); | ||
| int lkind = PyUnicode_KIND(largest); | ||
| const void *sdata = PyUnicode_DATA(smallest); | ||
| const void *ldata = PyUnicode_DATA(largest); | ||
|
|
||
| _len = new_len; | ||
| if (_len == 0) { | ||
| // No common things now, fast exit! | ||
| return 0; | ||
| } | ||
| while (margin < minlen) { | ||
| Py_UCS4 c1 = PyUnicode_READ(skind, sdata, margin); | ||
| Py_UCS4 c2 = PyUnicode_READ(lkind, ldata, margin); | ||
| if (c1 != c2 || !(c1 == ' ' || c1 == '\t')) { | ||
| break; | ||
| } | ||
| margin++; | ||
| } | ||
|
|
||
| assert(_len >= 0); | ||
| if (_len > 0) { | ||
| *output = _start; | ||
| } | ||
| return _len; | ||
| return margin; | ||
| } | ||
|
|
||
| /* Dedent a string. | ||
|
|
@@ -14395,74 +14377,58 @@ search_longest_common_leading_whitespace( | |
| PyObject * | ||
| _PyUnicode_Dedent(PyObject *unicode) | ||
| { | ||
| Py_ssize_t src_len = 0; | ||
| const char *src = PyUnicode_AsUTF8AndSize(unicode, &src_len); | ||
| if (!src) { | ||
| PyObject *sep = PyUnicode_FromString("\n"); | ||
| if (sep == NULL) { | ||
| return NULL; | ||
| } | ||
| assert(src_len >= 0); | ||
| if (src_len == 0) { | ||
| return Py_NewRef(unicode); | ||
| } | ||
|
|
||
| const char *const end = src + src_len; | ||
|
|
||
| // [whitespace_start, whitespace_start + whitespace_len) | ||
| // describes the current longest common leading whitespace | ||
| const char *whitespace_start = NULL; | ||
| Py_ssize_t whitespace_len = search_longest_common_leading_whitespace( | ||
| src, end, &whitespace_start); | ||
|
|
||
| if (whitespace_len == 0) { | ||
|
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Keep the fast path. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. We can't, we need to clear lines, see the tests. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Do we really need to do it? in general there is nothing to dedent, so it'll slow down normal cases. Even if the comment says that it's meant to match textwrap.dedent(), I don't think it's needed. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. If we want to respect the behavior of There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. But I don't think we need to! It's an internal function. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. But our documentation says we do:
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. If you would rather just remove the false claims, we can do: https://github.com/python/cpython/compare/main...StanFromIreland:remove-misleading-notes?expand=1 Though I think this may cause confusion in the future. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
Yes, but we can just say that we don't normalize whitespaces and only consider spaces and tabs. No one should ever write spaces with other space-like characters. Let's just amend the NEWS. As for There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Yes, I am working on the PEP :-) There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I opened #138620 with your alternative, though I still think fixing it is better. |
||
| return Py_NewRef(unicode); | ||
| PyObject *lines = PyUnicode_Split(unicode, sep, -1); | ||
| Py_DECREF(sep); | ||
| if (lines == NULL) { | ||
| return NULL; | ||
| } | ||
| Py_ssize_t nlines = PyList_GET_SIZE(lines); | ||
| Py_ssize_t margin = search_longest_common_leading_whitespace(lines, nlines); | ||
|
|
||
| // now we should trigger a dedent | ||
| char *dest = PyMem_Malloc(src_len); | ||
| if (!dest) { | ||
| PyErr_NoMemory(); | ||
| PyUnicodeWriter *writer = PyUnicodeWriter_Create(0); | ||
| if (writer == NULL) { | ||
| Py_DECREF(lines); | ||
| return NULL; | ||
| } | ||
| char *dest_iter = dest; | ||
|
|
||
| for (const char *iter = src; iter < end; ++iter) { | ||
| const char *line_start = iter; | ||
| bool in_leading_space = true; | ||
| for (Py_ssize_t i = 0; i < nlines; i++) { | ||
| PyObject *line = PyList_GET_ITEM(lines, i); | ||
| Py_ssize_t linelen = PyUnicode_GET_LENGTH(line); | ||
|
|
||
| // iterate over a line to find the end of a line | ||
| while (iter < end && *iter != '\n') { | ||
| if (in_leading_space && *iter != ' ' && *iter != '\t') { | ||
| in_leading_space = false; | ||
| int all_ws = 1; | ||
| int kind = PyUnicode_KIND(line); | ||
| void *data = PyUnicode_DATA(line); | ||
| for (Py_ssize_t j = 0; j < linelen; j++) { | ||
| if (!Py_UNICODE_ISSPACE(PyUnicode_READ(kind, data, j))) { | ||
| all_ws = 0; | ||
| break; | ||
| } | ||
| ++iter; | ||
| } | ||
|
|
||
| // invariant: *iter == '\n' or iter == end | ||
| bool append_newline = iter < end; | ||
|
|
||
| // if this line has all white space, write '\n' and continue | ||
| if (in_leading_space && append_newline) { | ||
| *dest_iter++ = '\n'; | ||
| continue; | ||
| if (!all_ws) { | ||
| Py_ssize_t start = Py_MIN(margin, linelen); | ||
| if (PyUnicodeWriter_WriteSubstring(writer, line, start, linelen) < 0) { | ||
| PyUnicodeWriter_Discard(writer); | ||
| Py_DECREF(lines); | ||
| return NULL; | ||
| } | ||
| } | ||
|
|
||
| /* copy [new_line_start + whitespace_len, iter) to buffer, then | ||
| conditionally append '\n' */ | ||
|
|
||
StanFromIreland marked this conversation as resolved.
Show resolved
Hide resolved
|
||
| Py_ssize_t new_line_len = iter - line_start - whitespace_len; | ||
| assert(new_line_len >= 0); | ||
| memcpy(dest_iter, line_start + whitespace_len, new_line_len); | ||
|
|
||
| dest_iter += new_line_len; | ||
|
|
||
| if (append_newline) { | ||
| *dest_iter++ = '\n'; | ||
| if (i < nlines - 1) { | ||
| if (PyUnicodeWriter_WriteChar(writer, '\n') < 0) { | ||
| PyUnicodeWriter_Discard(writer); | ||
| Py_DECREF(lines); | ||
| return NULL; | ||
| } | ||
| } | ||
| } | ||
|
|
||
| PyObject *res = PyUnicode_FromStringAndSize(dest, dest_iter - dest); | ||
| PyMem_Free(dest); | ||
| return res; | ||
| Py_DECREF(lines); | ||
| return PyUnicodeWriter_Finish(writer); | ||
| } | ||
|
|
||
| static PyMethodDef unicode_methods[] = { | ||
|
|
||
Uh oh!
There was an error while loading. Please reload this page.