|
5 | 5 | #include <Python.h> |
6 | 6 | #include "CPy.h" |
7 | 7 |
|
| 8 | +// Copied from cpython.git:Objects/unicodeobject.c@0ef4ffeefd1737c18dc9326133c7894d58108c2e. |
| 9 | +#define BLOOM_MASK unsigned long |
| 10 | +#define BLOOM(mask, ch) ((mask & (1UL << ((ch) & (BLOOM_WIDTH - 1))))) |
| 11 | +#if LONG_BIT >= 128 |
| 12 | +#define BLOOM_WIDTH 128 |
| 13 | +#elif LONG_BIT >= 64 |
| 14 | +#define BLOOM_WIDTH 64 |
| 15 | +#elif LONG_BIT >= 32 |
| 16 | +#define BLOOM_WIDTH 32 |
| 17 | +#else |
| 18 | +#error "LONG_BIT is smaller than 32" |
| 19 | +#endif |
| 20 | + |
| 21 | +// Copied from cpython.git:Objects/unicodeobject.c@0ef4ffeefd1737c18dc9326133c7894d58108c2e. |
| 22 | +// This is needed for str.strip("..."). |
| 23 | +static inline BLOOM_MASK |
| 24 | +make_bloom_mask(int kind, const void* ptr, Py_ssize_t len) |
| 25 | +{ |
| 26 | +#define BLOOM_UPDATE(TYPE, MASK, PTR, LEN) \ |
| 27 | + do { \ |
| 28 | + TYPE *data = (TYPE *)PTR; \ |
| 29 | + TYPE *end = data + LEN; \ |
| 30 | + Py_UCS4 ch; \ |
| 31 | + for (; data != end; data++) { \ |
| 32 | + ch = *data; \ |
| 33 | + MASK |= (1UL << (ch & (BLOOM_WIDTH - 1))); \ |
| 34 | + } \ |
| 35 | + break; \ |
| 36 | + } while (0) |
| 37 | + |
| 38 | + /* calculate simple bloom-style bitmask for a given unicode string */ |
| 39 | + |
| 40 | + BLOOM_MASK mask; |
| 41 | + |
| 42 | + mask = 0; |
| 43 | + switch (kind) { |
| 44 | + case PyUnicode_1BYTE_KIND: |
| 45 | + BLOOM_UPDATE(Py_UCS1, mask, ptr, len); |
| 46 | + break; |
| 47 | + case PyUnicode_2BYTE_KIND: |
| 48 | + BLOOM_UPDATE(Py_UCS2, mask, ptr, len); |
| 49 | + break; |
| 50 | + case PyUnicode_4BYTE_KIND: |
| 51 | + BLOOM_UPDATE(Py_UCS4, mask, ptr, len); |
| 52 | + break; |
| 53 | + default: |
| 54 | + Py_UNREACHABLE(); |
| 55 | + } |
| 56 | + return mask; |
| 57 | + |
| 58 | +#undef BLOOM_UPDATE |
| 59 | +} |
| 60 | + |
8 | 61 | PyObject *CPyStr_GetItem(PyObject *str, CPyTagged index) { |
9 | 62 | if (PyUnicode_READY(str) != -1) { |
10 | 63 | if (CPyTagged_CheckShort(index)) { |
@@ -174,6 +227,124 @@ PyObject *CPyStr_RSplit(PyObject *str, PyObject *sep, CPyTagged max_split) { |
174 | 227 | return PyUnicode_RSplit(str, sep, temp_max_split); |
175 | 228 | } |
176 | 229 |
|
| 230 | +// This function has been copied from _PyUnicode_XStrip in cpython.git:Objects/unicodeobject.c@0ef4ffeefd1737c18dc9326133c7894d58108c2e. |
| 231 | +static PyObject *_PyStr_XStrip(PyObject *self, int striptype, PyObject *sepobj) { |
| 232 | + const void *data; |
| 233 | + int kind; |
| 234 | + Py_ssize_t i, j, len; |
| 235 | + BLOOM_MASK sepmask; |
| 236 | + Py_ssize_t seplen; |
| 237 | + |
| 238 | + // This check is needed from Python 3.9 and earlier. |
| 239 | + if (PyUnicode_READY(self) == -1 || PyUnicode_READY(sepobj) == -1) |
| 240 | + return NULL; |
| 241 | + |
| 242 | + kind = PyUnicode_KIND(self); |
| 243 | + data = PyUnicode_DATA(self); |
| 244 | + len = PyUnicode_GET_LENGTH(self); |
| 245 | + seplen = PyUnicode_GET_LENGTH(sepobj); |
| 246 | + sepmask = make_bloom_mask(PyUnicode_KIND(sepobj), |
| 247 | + PyUnicode_DATA(sepobj), |
| 248 | + seplen); |
| 249 | + |
| 250 | + i = 0; |
| 251 | + if (striptype != RIGHTSTRIP) { |
| 252 | + while (i < len) { |
| 253 | + Py_UCS4 ch = PyUnicode_READ(kind, data, i); |
| 254 | + if (!BLOOM(sepmask, ch)) |
| 255 | + break; |
| 256 | + if (PyUnicode_FindChar(sepobj, ch, 0, seplen, 1) < 0) |
| 257 | + break; |
| 258 | + i++; |
| 259 | + } |
| 260 | + } |
| 261 | + |
| 262 | + j = len; |
| 263 | + if (striptype != LEFTSTRIP) { |
| 264 | + j--; |
| 265 | + while (j >= i) { |
| 266 | + Py_UCS4 ch = PyUnicode_READ(kind, data, j); |
| 267 | + if (!BLOOM(sepmask, ch)) |
| 268 | + break; |
| 269 | + if (PyUnicode_FindChar(sepobj, ch, 0, seplen, 1) < 0) |
| 270 | + break; |
| 271 | + j--; |
| 272 | + } |
| 273 | + |
| 274 | + j++; |
| 275 | + } |
| 276 | + |
| 277 | + return PyUnicode_Substring(self, i, j); |
| 278 | +} |
| 279 | + |
| 280 | +// Copied from do_strip function in cpython.git/Objects/unicodeobject.c@0ef4ffeefd1737c18dc9326133c7894d58108c2e. |
| 281 | +PyObject *_CPyStr_Strip(PyObject *self, int strip_type, PyObject *sep) { |
| 282 | + if (sep == NULL || sep == Py_None) { |
| 283 | + Py_ssize_t len, i, j; |
| 284 | + |
| 285 | + // This check is needed from Python 3.9 and earlier. |
| 286 | + if (PyUnicode_READY(self) == -1) |
| 287 | + return NULL; |
| 288 | + |
| 289 | + len = PyUnicode_GET_LENGTH(self); |
| 290 | + |
| 291 | + if (PyUnicode_IS_ASCII(self)) { |
| 292 | + const Py_UCS1 *data = PyUnicode_1BYTE_DATA(self); |
| 293 | + |
| 294 | + i = 0; |
| 295 | + if (strip_type != RIGHTSTRIP) { |
| 296 | + while (i < len) { |
| 297 | + Py_UCS1 ch = data[i]; |
| 298 | + if (!_Py_ascii_whitespace[ch]) |
| 299 | + break; |
| 300 | + i++; |
| 301 | + } |
| 302 | + } |
| 303 | + |
| 304 | + j = len; |
| 305 | + if (strip_type != LEFTSTRIP) { |
| 306 | + j--; |
| 307 | + while (j >= i) { |
| 308 | + Py_UCS1 ch = data[j]; |
| 309 | + if (!_Py_ascii_whitespace[ch]) |
| 310 | + break; |
| 311 | + j--; |
| 312 | + } |
| 313 | + j++; |
| 314 | + } |
| 315 | + } |
| 316 | + else { |
| 317 | + int kind = PyUnicode_KIND(self); |
| 318 | + const void *data = PyUnicode_DATA(self); |
| 319 | + |
| 320 | + i = 0; |
| 321 | + if (strip_type != RIGHTSTRIP) { |
| 322 | + while (i < len) { |
| 323 | + Py_UCS4 ch = PyUnicode_READ(kind, data, i); |
| 324 | + if (!Py_UNICODE_ISSPACE(ch)) |
| 325 | + break; |
| 326 | + i++; |
| 327 | + } |
| 328 | + } |
| 329 | + |
| 330 | + j = len; |
| 331 | + if (strip_type != LEFTSTRIP) { |
| 332 | + j--; |
| 333 | + while (j >= i) { |
| 334 | + Py_UCS4 ch = PyUnicode_READ(kind, data, j); |
| 335 | + if (!Py_UNICODE_ISSPACE(ch)) |
| 336 | + break; |
| 337 | + j--; |
| 338 | + } |
| 339 | + j++; |
| 340 | + } |
| 341 | + } |
| 342 | + |
| 343 | + return PyUnicode_Substring(self, i, j); |
| 344 | + } |
| 345 | + return _PyStr_XStrip(self, strip_type, sep); |
| 346 | +} |
| 347 | + |
177 | 348 | PyObject *CPyStr_Replace(PyObject *str, PyObject *old_substr, |
178 | 349 | PyObject *new_substr, CPyTagged max_replace) { |
179 | 350 | Py_ssize_t temp_max_replace = CPyTagged_AsSsize_t(max_replace); |
|
0 commit comments