-
-
Notifications
You must be signed in to change notification settings - Fork 3.1k
Add efficient primitives for str.strip(). #18742
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from 1 commit
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -5,6 +5,58 @@ | |
| #include <Python.h> | ||
| #include "CPy.h" | ||
|
|
||
| // Copied from cpython.git:Objects/unicodeobject.c. | ||
| #define BLOOM_MASK unsigned long | ||
| #define BLOOM(mask, ch) ((mask & (1UL << ((ch) & (BLOOM_WIDTH - 1))))) | ||
| #if LONG_BIT >= 128 | ||
| #define BLOOM_WIDTH 128 | ||
| #elif LONG_BIT >= 64 | ||
| #define BLOOM_WIDTH 64 | ||
| #elif LONG_BIT >= 32 | ||
| #define BLOOM_WIDTH 32 | ||
| #else | ||
| #error "LONG_BIT is smaller than 32" | ||
| #endif | ||
|
|
||
| // Copied from cpython.git:Objects/unicodeobject.c. This is needed for str.strip("..."). | ||
| static inline BLOOM_MASK | ||
| make_bloom_mask(int kind, const void* ptr, Py_ssize_t len) | ||
| { | ||
| #define BLOOM_UPDATE(TYPE, MASK, PTR, LEN) \ | ||
| do { \ | ||
| TYPE *data = (TYPE *)PTR; \ | ||
| TYPE *end = data + LEN; \ | ||
| Py_UCS4 ch; \ | ||
| for (; data != end; data++) { \ | ||
| ch = *data; \ | ||
| MASK |= (1UL << (ch & (BLOOM_WIDTH - 1))); \ | ||
| } \ | ||
| break; \ | ||
| } while (0) | ||
|
|
||
| /* calculate simple bloom-style bitmask for a given unicode string */ | ||
|
|
||
| BLOOM_MASK mask; | ||
|
|
||
| mask = 0; | ||
| switch (kind) { | ||
| case PyUnicode_1BYTE_KIND: | ||
| BLOOM_UPDATE(Py_UCS1, mask, ptr, len); | ||
| break; | ||
| case PyUnicode_2BYTE_KIND: | ||
| BLOOM_UPDATE(Py_UCS2, mask, ptr, len); | ||
| break; | ||
| case PyUnicode_4BYTE_KIND: | ||
| BLOOM_UPDATE(Py_UCS4, mask, ptr, len); | ||
| break; | ||
| default: | ||
| Py_UNREACHABLE(); | ||
| } | ||
| return mask; | ||
|
|
||
| #undef BLOOM_UPDATE | ||
| } | ||
|
|
||
| PyObject *CPyStr_GetItem(PyObject *str, CPyTagged index) { | ||
| if (PyUnicode_READY(str) != -1) { | ||
| if (CPyTagged_CheckShort(index)) { | ||
|
|
@@ -174,6 +226,116 @@ PyObject *CPyStr_RSplit(PyObject *str, PyObject *sep, CPyTagged max_split) { | |
| return PyUnicode_RSplit(str, sep, temp_max_split); | ||
| } | ||
|
|
||
| // This function has been copied from _PyUnicode_XStrip in cpython.git:Objects/unicodeobject.c. | ||
| static PyObject *_PyStr_XStrip(PyObject *self, int striptype, PyObject *sepobj) { | ||
| const void *data; | ||
| int kind; | ||
| Py_ssize_t i, j, len; | ||
| BLOOM_MASK sepmask; | ||
| Py_ssize_t seplen; | ||
|
|
||
| kind = PyUnicode_KIND(self); | ||
|
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. I think you need to call `PyUnicode_READY`.
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Done. |
||
| data = PyUnicode_DATA(self); | ||
| len = PyUnicode_GET_LENGTH(self); | ||
| seplen = PyUnicode_GET_LENGTH(sepobj); | ||
| sepmask = make_bloom_mask(PyUnicode_KIND(sepobj), | ||
| PyUnicode_DATA(sepobj), | ||
| seplen); | ||
|
|
||
| i = 0; | ||
| if (striptype != RIGHTSTRIP) { | ||
| while (i < len) { | ||
| Py_UCS4 ch = PyUnicode_READ(kind, data, i); | ||
| if (!BLOOM(sepmask, ch)) | ||
| break; | ||
| if (PyUnicode_FindChar(sepobj, ch, 0, seplen, 1) < 0) | ||
| break; | ||
| i++; | ||
| } | ||
| } | ||
|
|
||
| j = len; | ||
| if (striptype != LEFTSTRIP) { | ||
| j--; | ||
| while (j >= i) { | ||
| Py_UCS4 ch = PyUnicode_READ(kind, data, j); | ||
| if (!BLOOM(sepmask, ch)) | ||
| break; | ||
| if (PyUnicode_FindChar(sepobj, ch, 0, seplen, 1) < 0) | ||
| break; | ||
| j--; | ||
| } | ||
|
|
||
| j++; | ||
| } | ||
|
|
||
| return PyUnicode_Substring(self, i, j); | ||
| } | ||
|
|
||
| // Copied from the do_strip function in cpython.git:Objects/unicodeobject.c. | ||
| PyObject *_CPyStr_Strip(PyObject *self, int strip_type, PyObject *sep) { | ||
| if (sep == NULL || sep == Py_None) { | ||
| Py_ssize_t len, i, j; | ||
|
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Similar to above, I think you'll need to call `PyUnicode_READY`.
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Done. |
||
|
|
||
| len = PyUnicode_GET_LENGTH(self); | ||
|
|
||
| if (PyUnicode_IS_ASCII(self)) { | ||
| const Py_UCS1 *data = PyUnicode_1BYTE_DATA(self); | ||
|
|
||
| i = 0; | ||
| if (strip_type != RIGHTSTRIP) { | ||
| while (i < len) { | ||
| Py_UCS1 ch = data[i]; | ||
| if (!_Py_ascii_whitespace[ch]) | ||
| break; | ||
| i++; | ||
| } | ||
| } | ||
|
|
||
| j = len; | ||
| if (strip_type != LEFTSTRIP) { | ||
| j--; | ||
| while (j >= i) { | ||
| Py_UCS1 ch = data[j]; | ||
| if (!_Py_ascii_whitespace[ch]) | ||
| break; | ||
| j--; | ||
| } | ||
| j++; | ||
| } | ||
| } | ||
| else { | ||
| int kind = PyUnicode_KIND(self); | ||
| const void *data = PyUnicode_DATA(self); | ||
|
|
||
| i = 0; | ||
| if (strip_type != RIGHTSTRIP) { | ||
| while (i < len) { | ||
| Py_UCS4 ch = PyUnicode_READ(kind, data, i); | ||
| if (!Py_UNICODE_ISSPACE(ch)) | ||
| break; | ||
| i++; | ||
| } | ||
| } | ||
|
|
||
| j = len; | ||
| if (strip_type != LEFTSTRIP) { | ||
| j--; | ||
| while (j >= i) { | ||
| Py_UCS4 ch = PyUnicode_READ(kind, data, j); | ||
| if (!Py_UNICODE_ISSPACE(ch)) | ||
| break; | ||
| j--; | ||
| } | ||
| j++; | ||
| } | ||
| } | ||
|
|
||
| return PyUnicode_Substring(self, i, j); | ||
| } | ||
| return _PyStr_XStrip(self, strip_type, sep); | ||
| } | ||
|
|
||
| PyObject *CPyStr_Replace(PyObject *str, PyObject *old_substr, | ||
| PyObject *new_substr, CPyTagged max_replace) { | ||
| Py_ssize_t temp_max_replace = CPyTagged_AsSsize_t(max_replace); | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -774,3 +774,16 @@ def test_surrogate() -> None: | |
| assert ord(f()) == 0xd800 | ||
| assert ord("\udfff") == 0xdfff | ||
| assert repr("foobar\x00\xab\ud912\U00012345") == r"'foobar\x00«\ud912𒍅'" | ||
|
|
||
| [case testStrip] | ||
| # This is a negative test: strip variants without args do not use efficient primitives. | ||
|
||
| def test_all_strips_default() -> None: | ||
| s = " a1\t" | ||
| assert s.lstrip() == "a1\t" | ||
| assert s.strip() == "a1" | ||
| assert s.rstrip() == " a1" | ||
| def test_all_strips() -> None: | ||
| s = "xxb2yy" | ||
| assert s.lstrip("xy") == "b2yy" | ||
| assert s.strip("xy") == "b2" | ||
| assert s.rstrip("xy") == "xxb2" | ||
|
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Can you test all string kinds and different character code ranges, such as these (and mixing these):
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Done. |
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Please mention the Python version or commit date from where this is from.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Done.