diff --git a/Doc/library/stdtypes.rst b/Doc/library/stdtypes.rst index c8acde8b57dcdb..793f59cb3ca99e 100644 --- a/Doc/library/stdtypes.rst +++ b/Doc/library/stdtypes.rst @@ -1724,8 +1724,13 @@ expression support in the :mod:`re` module). .. method:: str.find(sub[, start[, end]]) Return the lowest index in the string where substring *sub* is found within - the slice ``s[start:end]``. Optional arguments *start* and *end* are - interpreted as in slice notation. Return ``-1`` if *sub* is not found. + the slice ``s[start:end]``. *sub* can also be a tuple of substrings to look + for. In this case the returned index, if found, will be the index of the + first match. Optional arguments *start* and *end* are interpreted as in + slice notation. Return ``-1`` if *sub* is not found. + + .. seealso:: + The :mod:`re` module, which provides advanced pattern matching. .. note:: @@ -1736,6 +1741,9 @@ expression support in the :mod:`re` module). >>> 'Py' in 'Python' True + .. versionchanged:: 3.14 + *sub* can now be a tuple of substrings. + .. method:: str.format(*args, **kwargs) @@ -1789,6 +1797,9 @@ expression support in the :mod:`re` module). Like :meth:`~str.find`, but raise :exc:`ValueError` when the substring is not found. + .. versionchanged:: 3.14 + *sub* can now be a tuple of substrings. + .. method:: str.isalnum() @@ -2030,8 +2041,16 @@ expression support in the :mod:`re` module). .. method:: str.rfind(sub[, start[, end]]) Return the highest index in the string where substring *sub* is found, such - that *sub* is contained within ``s[start:end]``. Optional arguments *start* - and *end* are interpreted as in slice notation. Return ``-1`` on failure. + that *sub* is contained within ``s[start:end]``. *sub* can also be a tuple + of substrings to look for. In this case the returned index, if found, will + be the index of the last match. Optional arguments *start* and *end* are + interpreted as in slice notation. Return ``-1`` on failure. + + .. 
seealso:: + The third-party :pypi:`regex` module, which provides advanced pattern matching. + + .. versionchanged:: 3.14 + *sub* can now be a tuple of substrings. .. method:: str.rindex(sub[, start[, end]]) @@ -2039,6 +2058,9 @@ expression support in the :mod:`re` module). Like :meth:`rfind` but raises :exc:`ValueError` when the substring *sub* is not found. + .. versionchanged:: 3.14 + *sub* can now be a tuple of substrings. + .. method:: str.rjust(width[, fillchar]) @@ -2859,13 +2881,18 @@ arbitrary binary data. bytearray.find(sub[, start[, end]]) Return the lowest index in the data where the subsequence *sub* is found, - such that *sub* is contained in the slice ``s[start:end]``. Optional - arguments *start* and *end* are interpreted as in slice notation. Return - ``-1`` if *sub* is not found. + such that *sub* is contained in the slice ``s[start:end]``. *sub* can + also be a tuple of subsequences to look for. In this case the returned + index, if found, will be the index of the first match. Optional arguments + *start* and *end* are interpreted as in slice notation. Return ``-1`` if + *sub* is not found. The subsequence to search for may be any :term:`bytes-like object` or an integer in the range 0 to 255. + .. seealso:: + The :mod:`re` module, which provides advanced pattern matching. + .. note:: The :meth:`~bytes.find` method should be used only if you need to know the @@ -2878,6 +2905,9 @@ arbitrary binary data. .. versionchanged:: 3.3 Also accept an integer in the range 0 to 255 as the subsequence. + .. versionchanged:: 3.14 + *sub* can now be a tuple of subsequences. + .. method:: bytes.index(sub[, start[, end]]) bytearray.index(sub[, start[, end]]) @@ -2891,6 +2921,9 @@ arbitrary binary data. .. versionchanged:: 3.3 Also accept an integer in the range 0 to 255 as the subsequence. + .. versionchanged:: 3.14 + *sub* can now be a tuple of subsequences. + .. method:: bytes.join(iterable) bytearray.join(iterable) @@ -2947,16 +2980,24 @@ arbitrary binary data. 
bytearray.rfind(sub[, start[, end]]) Return the highest index in the sequence where the subsequence *sub* is - found, such that *sub* is contained within ``s[start:end]``. Optional - arguments *start* and *end* are interpreted as in slice notation. Return - ``-1`` on failure. + found, such that *sub* is contained within ``s[start:end]``. *sub* can + also be a tuple of subsequences to look for. In this case the returned + index, if found, will be the index of the last match. Optional arguments + *start* and *end* are interpreted as in slice notation. Return ``-1`` on + failure. The subsequence to search for may be any :term:`bytes-like object` or an integer in the range 0 to 255. + .. seealso:: + The third-party :pypi:`regex` module, which provides advanced pattern matching. + .. versionchanged:: 3.3 Also accept an integer in the range 0 to 255 as the subsequence. + .. versionchanged:: 3.14 + *sub* can now be a tuple of subsequences. + .. method:: bytes.rindex(sub[, start[, end]]) bytearray.rindex(sub[, start[, end]]) @@ -2970,6 +3011,9 @@ arbitrary binary data. .. versionchanged:: 3.3 Also accept an integer in the range 0 to 255 as the subsequence. + .. versionchanged:: 3.14 + *sub* can now be a tuple of subsequences. + .. method:: bytes.rpartition(sep) bytearray.rpartition(sep) diff --git a/Doc/whatsnew/3.14.rst b/Doc/whatsnew/3.14.rst index b2dd80b64a691a..f219b4d2d7354b 100644 --- a/Doc/whatsnew/3.14.rst +++ b/Doc/whatsnew/3.14.rst @@ -75,7 +75,8 @@ New Features Other Language Changes ====================== - +* Support tuples for :class:`str`, :class:`bytes` and :class:`bytearray` methods ``find()``, ``index()``, ``rfind()`` and ``rindex()``. + (Contributed by Wannes Boeykens in ``gh-???``.) 
New Modules =========== diff --git a/Lib/test/string_tests.py b/Lib/test/string_tests.py index 9bb0ce7bb57f8b..90defc0194c837 100644 --- a/Lib/test/string_tests.py +++ b/Lib/test/string_tests.py @@ -180,8 +180,12 @@ def test_find(self): if self.contains_bytes: self.checkequal(-1, 'hello', 'find', 42) + self.checkequal(-1, 'hello', 'find', (42,)) + self.checkequal(-1, 'hello', 'find', (42, 47)) else: self.checkraises(TypeError, 'hello', 'find', 42) + self.checkraises(TypeError, 'hello', 'find', (42,)) + self.checkraises(TypeError, 'hello', 'find', (42, 47)) self.checkequal(0, '', 'find', '') self.checkequal(-1, '', 'find', '', 1, 1) @@ -217,6 +221,34 @@ def test_find(self): if loc != -1: self.assertEqual(i[loc:loc+len(j)], j) + # test tuple arguments + MIN = 32 # FIND_MIN_CHUNK_SIZE + self.checkequal(-1, '__aa__bb__', 'find', ()) + self.checkequal(2, '__aa__bb__', 'find', ('aa',)) + self.checkequal(-1, '__aa__bb__', 'find', ('cc',)) + self.checkequal(-1, '__aa__bb__', 'find', ('aa', 'bb'), 10, 0) + self.checkequal(2, '__aa__bb__', 'find', ('aa', 'bb')) + self.checkequal(2, '__aa__bb__', 'find', ('bb', 'aa')) + self.checkequal(-1, '__aa__bb__', 'find', ('cc', 'dd')) + self.checkequal(6, '__aa__bb__', 'find', ('aa', 'bb'), 3) + self.checkequal(-1, '__aa__bb__', 'find', ('aa', 'cc'), 3) + self.checkequal(2, '__aa__bb__', 'find', ('aa', 'bb'), 0, 10) + self.checkequal(-1, '__aa__bb__', 'find', ('aa', 'bb'), 0, 3) + self.checkequal(2, '__aa__bb__', 'find', ('aa', 'bb'), 0, 4) + self.checkraises(TypeError, 'hello', 'find', (1.0, 2.0)) + self.checkraises(TypeError, 'hello', 'find', (1.0, 2.0), 5, 0) + s = '_' * (MIN - 2) + 'aaaa' + '_' * (MIN - 2) + self.checkequal((MIN - 2), s, 'find', ('aaaa', 'bb')) + self.checkequal(2, 'foobar', 'find', ('ob', 'oba')) + self.checkequal(1, 'foobar', 'find', ('ob', 'oob')) + self.checkequal(0, '', 'find', ('', 'a')) + self.checkequal(2, '__abcd__', 'find', ('cd', 'ab')) + self.checkequal(2, '__abc__', 'find', ('bc', 'ab')) + 
self.checkequal(1, 'a' + 'b' * MIN, 'find', ('b' * MIN, 'c')) + s = 'ab' + 'c' * (10 * MIN) + self.checkequal(1, s, 'find', ('c' * (10 * MIN), 'b' + 'c' * (10 * MIN))) + self.checkequal(0, 'foobar', 'find', ('foo', 'bar')) + def test_rfind(self): self.checkequal(9, 'abcdefghiabc', 'rfind', 'abc') self.checkequal(12, 'abcdefghiabc', 'rfind', '') @@ -238,8 +270,12 @@ def test_rfind(self): if self.contains_bytes: self.checkequal(-1, 'hello', 'rfind', 42) + self.checkequal(-1, 'hello', 'rfind', (42,)) + self.checkequal(-1, 'hello', 'rfind', (42, 47)) else: self.checkraises(TypeError, 'hello', 'rfind', 42) + self.checkraises(TypeError, 'hello', 'rfind', (42,)) + self.checkraises(TypeError, 'hello', 'rfind', (42, 47)) # For a variety of combinations, # verify that str.rfind() matches __contains__ @@ -270,6 +306,34 @@ def test_rfind(self): # issue #15534 self.checkequal(0, '<......\u043c...', "rfind", "<") + # test tuple arguments + MIN = 32 # FIND_MIN_CHUNK_SIZE + self.checkequal(-1, '__aa__bb__', 'rfind', ()) + self.checkequal(6, '__aa__bb__', 'rfind', ('bb',)) + self.checkequal(-1, '__aa__bb__', 'rfind', ('cc',)) + self.checkequal(-1, '__aa__bb__', 'rfind', ('aa', 'bb'), 10, 0) + self.checkequal(6, '__aa__bb__', 'rfind', ('aa', 'bb')) + self.checkequal(6, '__aa__bb__', 'rfind', ('bb', 'aa')) + self.checkequal(-1, '__aa__bb__', 'rfind', ('cc', 'dd')) + self.checkequal(-1, '__aa__bb__', 'rfind', ('aa', 'cc'), 3) + self.checkequal(6, '__aa__bb__', 'rfind', ('aa', 'bb'), 0, 10) + self.checkequal(-1, '__aa__bb__', 'rfind', ('aa', 'bb'), 7, 10) + self.checkequal(6, '__aa__bb__', 'rfind', ('aa', 'bb'), 6, 10) + self.checkraises(TypeError, 'hello', 'rfind', (1.0, 2.0)) + self.checkraises(TypeError, 'hello', 'rfind', (1.0, 2.0), 5, 0) + s = '_' * (MIN - 2) + 'aaaa' + '_' * (MIN - 2) + self.checkequal((MIN - 2), s, 'rfind', ('aaaa', 'bb')) + self.checkequal(2, 'foobar', 'rfind', ('oba', 'ob')) + self.checkequal(2, 'foobar', 'rfind', ('oob', 'ob')) + self.checkequal(0, '', 
'rfind', ('', 'a')) + self.checkequal(4, '__abcd__', 'rfind', ('ab', 'cd')) + self.checkequal(3, '__abc__', 'rfind', ('ab', 'bc')) + self.checkequal(0, 'b' * MIN + 'a', 'rfind', ('b' * MIN, 'c')) + s = 'ab' + 'c' * (10 * MIN) + self.checkequal(2, s, 'rfind', ('c' * (10 * MIN), 'b' + 'c' * (10 * MIN))) + self.checkequal(3, 'foo', 'rfind', ('', 'foo')) + self.checkequal(-1, 'foo', 'rfind', ('foobar',)) + def test_index(self): self.checkequal(0, 'abcdefghiabc', 'index', '') self.checkequal(3, 'abcdefghiabc', 'index', 'def') @@ -292,8 +356,17 @@ def test_index(self): if self.contains_bytes: self.checkraises(ValueError, 'hello', 'index', 42) + self.checkraises(ValueError, 'hello', 'index', (42,)) + self.checkraises(ValueError, 'hello', 'index', (42, 47)) else: self.checkraises(TypeError, 'hello', 'index', 42) + self.checkraises(TypeError, 'hello', 'index', (42,)) + self.checkraises(TypeError, 'hello', 'index', (42, 47)) + + # test tuple arguments (should be wrapper around find) + self.checkequal(2, '__aa__bb__', 'index', ('aa', 'bb')) + self.checkequal(2, '__aa__bb__', 'index', ('bb', 'aa')) + self.checkraises(ValueError, '__aa__bb__', 'index', ('cc', 'dd')) def test_rindex(self): self.checkequal(12, 'abcdefghiabc', 'rindex', '') @@ -318,8 +391,17 @@ def test_rindex(self): if self.contains_bytes: self.checkraises(ValueError, 'hello', 'rindex', 42) + self.checkraises(ValueError, 'hello', 'rindex', (42,)) + self.checkraises(ValueError, 'hello', 'rindex', (42, 47)) else: self.checkraises(TypeError, 'hello', 'rindex', 42) + self.checkraises(TypeError, 'hello', 'rindex', (42,)) + self.checkraises(TypeError, 'hello', 'rindex', (42, 47)) + + # test tuple arguments (should be wrapper around rfind) + self.checkequal(6, '__aa__bb__', 'rindex', ('aa', 'bb')) + self.checkequal(6, '__aa__bb__', 'rindex', ('bb', 'aa')) + self.checkraises(ValueError, '__aa__bb__', 'rindex', ('cc', 'dd')) def test_find_periodic_pattern(self): """Cover the special path for periodic patterns.""" diff 
--git a/Lib/test/test_bytes.py b/Lib/test/test_bytes.py index 9e1985bb3a7639..abcffc033f3b86 100644 --- a/Lib/test/test_bytes.py +++ b/Lib/test/test_bytes.py @@ -644,6 +644,12 @@ def test_find(self): ValueError, r'byte must be in range\(0, 256\)', b.find, index) + # test tuple arguments + self.assertEqual(b.find((i,)), 1) + self.assertEqual(b.find((w,)), -1) + self.assertEqual(b.find((i, w)), 1) + self.assertEqual(b.find((w, i)), 1) + def test_rfind(self): b = self.type2test(b'mississippi') i = 105 @@ -663,6 +669,12 @@ def test_rfind(self): self.assertEqual(b.rfind(i, 3, 9), 7) self.assertEqual(b.rfind(w, 1, 3), -1) + # test tuple arguments + self.assertEqual(b.rfind((i,)), 10) + self.assertEqual(b.rfind((w,)), -1) + self.assertEqual(b.rfind((i, w)), 10) + self.assertEqual(b.rfind((w, i)), 10) + def test_index(self): b = self.type2test(b'mississippi') i = 105 diff --git a/Lib/test/test_inspect/test_inspect.py b/Lib/test/test_inspect/test_inspect.py index 011d42f34b6461..3f0536f10608a8 100644 --- a/Lib/test/test_inspect/test_inspect.py +++ b/Lib/test/test_inspect/test_inspect.py @@ -5414,7 +5414,7 @@ def test_builtins_have_signatures(self): 'dict': {'pop'}, 'int': {'__round__'}, 'memoryview': {'cast', 'hex'}, - 'str': {'count', 'endswith', 'find', 'index', 'maketrans', 'rfind', 'rindex', 'startswith'}, + 'str': {'count', 'endswith', 'maketrans', 'startswith'}, } self._test_module_has_signatures(builtins, no_signature, unsupported_signature, @@ -5589,7 +5589,7 @@ def test_typing_module_has_signatures(self): 'Generic': {'__class_getitem__', '__init_subclass__'}, } methods_unsupported_signature = { - 'Text': {'count', 'find', 'index', 'rfind', 'rindex', 'startswith', 'endswith', 'maketrans'}, + 'Text': {'count', 'startswith', 'endswith', 'maketrans'}, } self._test_module_has_signatures(typing, no_signature, methods_no_signature=methods_no_signature, diff --git a/Objects/bytes_methods.c b/Objects/bytes_methods.c index 981aa57164385e..350fab8954bbba 100644 --- 
a/Objects/bytes_methods.c +++ b/Objects/bytes_methods.c @@ -496,35 +496,16 @@ parse_args_finds_byte(const char *function_name, PyObject **subobj, char *byte) start = 0; \ } -Py_LOCAL_INLINE(Py_ssize_t) -find_internal(const char *str, Py_ssize_t len, - const char *function_name, PyObject *subobj, - Py_ssize_t start, Py_ssize_t end, - int dir) +static Py_ssize_t +_Py_fast_find_sub(const char *str, Py_ssize_t len, + const char *sub, Py_ssize_t sub_len, + Py_ssize_t start, Py_ssize_t end, + int dir) { - char byte; - Py_buffer subbuf; - const char *sub; - Py_ssize_t sub_len; Py_ssize_t res; - if (!parse_args_finds_byte(function_name, &subobj, &byte)) { - return -2; - } - - if (subobj) { - if (PyObject_GetBuffer(subobj, &subbuf, PyBUF_SIMPLE) != 0) - return -2; - - sub = subbuf.buf; - sub_len = subbuf.len; - } - else { - sub = &byte; - sub_len = 1; - } - - ADJUST_INDICES(start, end, len); + assert(start >= 0); + assert(end <= len); if (end - start < sub_len) res = -1; else if (sub_len == 1) { @@ -550,17 +531,211 @@ find_internal(const char *str, Py_ssize_t len, sub, sub_len, start, end); } + return res; +} + +static Py_ssize_t +_Py_find_sub(const char *str, Py_ssize_t len, + const char *function_name, PyObject *subobj, + Py_ssize_t start, Py_ssize_t end, + int dir) +{ + char byte; + Py_buffer subbuf; + const char *sub; + Py_ssize_t sub_len; + Py_ssize_t res; + + if (!parse_args_finds_byte(function_name, &subobj, &byte)) { + return -2; + } + + if (subobj) { + if (PyObject_GetBuffer(subobj, &subbuf, PyBUF_SIMPLE) != 0) + return -2; + + sub = subbuf.buf; + sub_len = subbuf.len; + } + else { + sub = &byte; + sub_len = 1; + } + + ADJUST_INDICES(start, end, len); + res = _Py_fast_find_sub(str, len, sub, sub_len, start, end, dir); + + if (subobj) + PyBuffer_Release(&subbuf); + + return res; +} + +static Py_ssize_t +_Py_chunk_find_sub(const char *str, Py_ssize_t len, + const char *function_name, PyObject *subobj, + Py_ssize_t chunk_start, Py_ssize_t chunk_end, + Py_ssize_t end, 
int direction) +{ + char byte; + Py_buffer subbuf; + const char *sub; + Py_ssize_t sub_len; + Py_ssize_t res; + + assert(chunk_end <= end); + if (!parse_args_finds_byte(function_name, &subobj, &byte)) { + return -2; + } + + if (subobj) { + if (PyObject_GetBuffer(subobj, &subbuf, PyBUF_SIMPLE) != 0) { + return -2; + } + sub = subbuf.buf; + sub_len = subbuf.len; + } + else { + sub = &byte; + sub_len = 1; + } + + if (chunk_end >= end - sub_len) { // Guard overflow + res = _Py_fast_find_sub(str, len, sub, sub_len, chunk_start, end, + direction); + } + else { + res = _Py_fast_find_sub(str, len, sub, sub_len, chunk_start, + chunk_end + sub_len, direction); + } + if (subobj) PyBuffer_Release(&subbuf); return res; } +#define FIND_MIN_CHUNK_SIZE 32 +#define FIND_MAX_CHUNK_SIZE 16384 +#define FIND_EXP_CHUNK_SIZE 2 + +static Py_ssize_t +_Py_find_subs(const char *str, Py_ssize_t len, + const char *function_name, PyObject *subobj, + Py_ssize_t start, Py_ssize_t end, + int direction) +{ + Py_ssize_t tuple_len, result, chunk_size; + + tuple_len = PyTuple_GET_SIZE(subobj); + if (tuple_len <= 1) { + if (tuple_len == 0) { + return -1; + } + PyObject *subseq = PyTuple_GET_ITEM(subobj, 0); + return _Py_find_sub(str, len, function_name, subseq, start, end, + direction); + } + assert(FIND_MIN_CHUNK_SIZE > 0); + assert(FIND_MAX_CHUNK_SIZE >= FIND_MIN_CHUNK_SIZE); + assert(FIND_EXP_CHUNK_SIZE >= 1); + result = -1; + chunk_size = FIND_MIN_CHUNK_SIZE; + ADJUST_INDICES(start, end, len); + if (direction > 0) { + Py_ssize_t chunk_start = start; + while (1) { + Py_ssize_t chunk_end; + if (chunk_start >= end - chunk_size) { // Guard overflow + chunk_end = end; + } + else { + chunk_end = chunk_start + chunk_size - 1; + } + for (Py_ssize_t i = 0; i < tuple_len; i++) { + PyObject *subseq; + Py_ssize_t new_result; + + subseq = PyTuple_GET_ITEM(subobj, i); + new_result = _Py_chunk_find_sub(str, len, function_name, + subseq, chunk_start, chunk_end, + end, +1); + if (new_result != -1) { + if 
(new_result == -2 || new_result == chunk_start) { + return new_result; + } + chunk_end = new_result - 1; // Only allow earlier match + result = new_result; + } + } + if (result != -1 || chunk_end >= end) { + // Found match or searched entire range (guard overflow) + return result; + } + chunk_start = chunk_end + 1; + chunk_size *= FIND_EXP_CHUNK_SIZE; + if (chunk_size > FIND_MAX_CHUNK_SIZE) { + chunk_size = FIND_MAX_CHUNK_SIZE; + } + } + } + else { + Py_ssize_t chunk_end = end; + while (1) { + Py_ssize_t chunk_start = chunk_end - chunk_size + 1; + if (chunk_start - 1 <= start) { + chunk_start = start; + } + for (Py_ssize_t i = 0; i < tuple_len; i++) { + PyObject *subseq; + Py_ssize_t new_result; + + subseq = PyTuple_GET_ITEM(subobj, i); + new_result = _Py_chunk_find_sub(str, len, function_name, + subseq, chunk_start, chunk_end, + end, -1); + if (new_result != -1) { + if (new_result == -2 || new_result == chunk_end) { + return new_result; + } + chunk_start = new_result + 1; // Only allow later match + result = new_result; + } + } + if (result != -1 || chunk_start <= start) { + // Found match or searched entire range + return result; + } + chunk_end = chunk_start - 1; + chunk_size *= FIND_EXP_CHUNK_SIZE; + if (chunk_size > FIND_MAX_CHUNK_SIZE) { + chunk_size = FIND_MAX_CHUNK_SIZE; + } + } + } +} + +Py_LOCAL_INLINE(Py_ssize_t) +find(const char *str, Py_ssize_t len, + const char *function_name, PyObject *subobj, + Py_ssize_t start, Py_ssize_t end, + int direction) +{ + if (PyTuple_Check(subobj)) { + return _Py_find_subs(str, len, function_name, subobj, start, end, + direction); + } + else { + return _Py_find_sub(str, len, function_name, subobj, start, end, + direction); + } +} + PyObject * _Py_bytes_find(const char *str, Py_ssize_t len, PyObject *sub, Py_ssize_t start, Py_ssize_t end) { - Py_ssize_t result = find_internal(str, len, "find", sub, start, end, +1); + Py_ssize_t result = find(str, len, "find", sub, start, end, +1); if (result == -2) return NULL; return 
PyLong_FromSsize_t(result); @@ -570,7 +745,7 @@ PyObject * _Py_bytes_index(const char *str, Py_ssize_t len, PyObject *sub, Py_ssize_t start, Py_ssize_t end) { - Py_ssize_t result = find_internal(str, len, "index", sub, start, end, +1); + Py_ssize_t result = find(str, len, "index", sub, start, end, +1); if (result == -2) return NULL; if (result == -1) { @@ -585,7 +760,7 @@ PyObject * _Py_bytes_rfind(const char *str, Py_ssize_t len, PyObject *sub, Py_ssize_t start, Py_ssize_t end) { - Py_ssize_t result = find_internal(str, len, "rfind", sub, start, end, -1); + Py_ssize_t result = find(str, len, "rfind", sub, start, end, -1); if (result == -2) return NULL; return PyLong_FromSsize_t(result); @@ -595,7 +770,7 @@ PyObject * _Py_bytes_rindex(const char *str, Py_ssize_t len, PyObject *sub, Py_ssize_t start, Py_ssize_t end) { - Py_ssize_t result = find_internal(str, len, "rindex", sub, start, end, -1); + Py_ssize_t result = find(str, len, "rindex", sub, start, end, -1); if (result == -2) return NULL; if (result == -1) { diff --git a/Objects/clinic/unicodeobject.c.h b/Objects/clinic/unicodeobject.c.h index 78e14b0021d006..700fb7dd9a487c 100644 --- a/Objects/clinic/unicodeobject.c.h +++ b/Objects/clinic/unicodeobject.c.h @@ -357,7 +357,7 @@ unicode_expandtabs(PyObject *self, PyObject *const *args, Py_ssize_t nargs, PyOb } PyDoc_STRVAR(unicode_find__doc__, -"find($self, sub[, start[, end]], /)\n" +"find($self, sub, start=None, end=None, /)\n" "--\n" "\n" "Return the lowest index in S where substring sub is found, such that sub is contained within S[start:end].\n" @@ -369,14 +369,14 @@ PyDoc_STRVAR(unicode_find__doc__, {"find", _PyCFunction_CAST(unicode_find), METH_FASTCALL, unicode_find__doc__}, static Py_ssize_t -unicode_find_impl(PyObject *str, PyObject *substr, Py_ssize_t start, +unicode_find_impl(PyObject *str, PyObject *sub, Py_ssize_t start, Py_ssize_t end); static PyObject * unicode_find(PyObject *str, PyObject *const *args, Py_ssize_t nargs) { PyObject *return_value = 
NULL; - PyObject *substr; + PyObject *sub; Py_ssize_t start = 0; Py_ssize_t end = PY_SSIZE_T_MAX; Py_ssize_t _return_value; @@ -384,11 +384,7 @@ unicode_find(PyObject *str, PyObject *const *args, Py_ssize_t nargs) if (!_PyArg_CheckPositional("find", nargs, 1, 3)) { goto exit; } - if (!PyUnicode_Check(args[0])) { - _PyArg_BadArgument("find", "argument 1", "str", args[0]); - goto exit; - } - substr = args[0]; + sub = args[0]; if (nargs < 2) { goto skip_optional; } @@ -402,7 +398,7 @@ unicode_find(PyObject *str, PyObject *const *args, Py_ssize_t nargs) goto exit; } skip_optional: - _return_value = unicode_find_impl(str, substr, start, end); + _return_value = unicode_find_impl(str, sub, start, end); if ((_return_value == -1) && PyErr_Occurred()) { goto exit; } @@ -413,7 +409,7 @@ unicode_find(PyObject *str, PyObject *const *args, Py_ssize_t nargs) } PyDoc_STRVAR(unicode_index__doc__, -"index($self, sub[, start[, end]], /)\n" +"index($self, sub, start=None, end=None, /)\n" "--\n" "\n" "Return the lowest index in S where substring sub is found, such that sub is contained within S[start:end].\n" @@ -425,14 +421,14 @@ PyDoc_STRVAR(unicode_index__doc__, {"index", _PyCFunction_CAST(unicode_index), METH_FASTCALL, unicode_index__doc__}, static Py_ssize_t -unicode_index_impl(PyObject *str, PyObject *substr, Py_ssize_t start, +unicode_index_impl(PyObject *str, PyObject *sub, Py_ssize_t start, Py_ssize_t end); static PyObject * unicode_index(PyObject *str, PyObject *const *args, Py_ssize_t nargs) { PyObject *return_value = NULL; - PyObject *substr; + PyObject *sub; Py_ssize_t start = 0; Py_ssize_t end = PY_SSIZE_T_MAX; Py_ssize_t _return_value; @@ -440,11 +436,7 @@ unicode_index(PyObject *str, PyObject *const *args, Py_ssize_t nargs) if (!_PyArg_CheckPositional("index", nargs, 1, 3)) { goto exit; } - if (!PyUnicode_Check(args[0])) { - _PyArg_BadArgument("index", "argument 1", "str", args[0]); - goto exit; - } - substr = args[0]; + sub = args[0]; if (nargs < 2) { goto 
skip_optional; } @@ -458,7 +450,7 @@ unicode_index(PyObject *str, PyObject *const *args, Py_ssize_t nargs) goto exit; } skip_optional: - _return_value = unicode_index_impl(str, substr, start, end); + _return_value = unicode_index_impl(str, sub, start, end); if ((_return_value == -1) && PyErr_Occurred()) { goto exit; } @@ -1060,7 +1052,7 @@ unicode_removesuffix(PyObject *self, PyObject *arg) } PyDoc_STRVAR(unicode_rfind__doc__, -"rfind($self, sub[, start[, end]], /)\n" +"rfind($self, sub, start=None, end=None, /)\n" "--\n" "\n" "Return the highest index in S where substring sub is found, such that sub is contained within S[start:end].\n" @@ -1072,14 +1064,14 @@ PyDoc_STRVAR(unicode_rfind__doc__, {"rfind", _PyCFunction_CAST(unicode_rfind), METH_FASTCALL, unicode_rfind__doc__}, static Py_ssize_t -unicode_rfind_impl(PyObject *str, PyObject *substr, Py_ssize_t start, +unicode_rfind_impl(PyObject *str, PyObject *sub, Py_ssize_t start, Py_ssize_t end); static PyObject * unicode_rfind(PyObject *str, PyObject *const *args, Py_ssize_t nargs) { PyObject *return_value = NULL; - PyObject *substr; + PyObject *sub; Py_ssize_t start = 0; Py_ssize_t end = PY_SSIZE_T_MAX; Py_ssize_t _return_value; @@ -1087,11 +1079,7 @@ unicode_rfind(PyObject *str, PyObject *const *args, Py_ssize_t nargs) if (!_PyArg_CheckPositional("rfind", nargs, 1, 3)) { goto exit; } - if (!PyUnicode_Check(args[0])) { - _PyArg_BadArgument("rfind", "argument 1", "str", args[0]); - goto exit; - } - substr = args[0]; + sub = args[0]; if (nargs < 2) { goto skip_optional; } @@ -1105,7 +1093,7 @@ unicode_rfind(PyObject *str, PyObject *const *args, Py_ssize_t nargs) goto exit; } skip_optional: - _return_value = unicode_rfind_impl(str, substr, start, end); + _return_value = unicode_rfind_impl(str, sub, start, end); if ((_return_value == -1) && PyErr_Occurred()) { goto exit; } @@ -1116,7 +1104,7 @@ unicode_rfind(PyObject *str, PyObject *const *args, Py_ssize_t nargs) } PyDoc_STRVAR(unicode_rindex__doc__, -"rindex($self, 
sub[, start[, end]], /)\n" +"rindex($self, sub, start=None, end=None, /)\n" "--\n" "\n" "Return the highest index in S where substring sub is found, such that sub is contained within S[start:end].\n" @@ -1128,14 +1116,14 @@ PyDoc_STRVAR(unicode_rindex__doc__, {"rindex", _PyCFunction_CAST(unicode_rindex), METH_FASTCALL, unicode_rindex__doc__}, static Py_ssize_t -unicode_rindex_impl(PyObject *str, PyObject *substr, Py_ssize_t start, +unicode_rindex_impl(PyObject *str, PyObject *sub, Py_ssize_t start, Py_ssize_t end); static PyObject * unicode_rindex(PyObject *str, PyObject *const *args, Py_ssize_t nargs) { PyObject *return_value = NULL; - PyObject *substr; + PyObject *sub; Py_ssize_t start = 0; Py_ssize_t end = PY_SSIZE_T_MAX; Py_ssize_t _return_value; @@ -1143,11 +1131,7 @@ unicode_rindex(PyObject *str, PyObject *const *args, Py_ssize_t nargs) if (!_PyArg_CheckPositional("rindex", nargs, 1, 3)) { goto exit; } - if (!PyUnicode_Check(args[0])) { - _PyArg_BadArgument("rindex", "argument 1", "str", args[0]); - goto exit; - } - substr = args[0]; + sub = args[0]; if (nargs < 2) { goto skip_optional; } @@ -1161,7 +1145,7 @@ unicode_rindex(PyObject *str, PyObject *const *args, Py_ssize_t nargs) goto exit; } skip_optional: - _return_value = unicode_rindex_impl(str, substr, start, end); + _return_value = unicode_rindex_impl(str, sub, start, end); if ((_return_value == -1) && PyErr_Occurred()) { goto exit; } @@ -1888,4 +1872,4 @@ unicode_new(PyTypeObject *type, PyObject *args, PyObject *kwargs) exit: return return_value; } -/*[clinic end generated code: output=9fee62bd337f809b input=a9049054013a1b77]*/ +/*[clinic end generated code: output=fb38686a525c786a input=a9049054013a1b77]*/ diff --git a/Objects/unicodeobject.c b/Objects/unicodeobject.c index 3b0b4173408724..0227ea80879aa3 100644 --- a/Objects/unicodeobject.c +++ b/Objects/unicodeobject.c @@ -9057,28 +9057,19 @@ _PyUnicode_TransformDecimalAndSpaceToASCII(PyObject *unicode) } static Py_ssize_t -any_find_slice(PyObject* 
s1, PyObject* s2, - Py_ssize_t start, - Py_ssize_t end, - int direction) +fast_find_sub(const void *buf1, int kind1, Py_ssize_t len1, + const void *buf2, int kind2, Py_ssize_t len2, + Py_ssize_t start, Py_ssize_t end, + int isascii, int direction) { - int kind1, kind2; - const void *buf1, *buf2; - Py_ssize_t len1, len2, result; - - kind1 = PyUnicode_KIND(s1); - kind2 = PyUnicode_KIND(s2); - if (kind1 < kind2) - return -1; + Py_ssize_t result; - len1 = PyUnicode_GET_LENGTH(s1); - len2 = PyUnicode_GET_LENGTH(s2); - ADJUST_INDICES(start, end, len1); + assert(kind2 <= kind1); + assert(start >= 0); + assert(end <= len1); if (end - start < len2) return -1; - buf1 = PyUnicode_DATA(s1); - buf2 = PyUnicode_DATA(s2); if (len2 == 1) { Py_UCS4 ch = PyUnicode_READ(kind2, buf2, 0); result = findchar((const char *)buf1 + kind1*start, @@ -9098,7 +9089,7 @@ any_find_slice(PyObject* s1, PyObject* s2, if (direction > 0) { switch (kind1) { case PyUnicode_1BYTE_KIND: - if (PyUnicode_IS_ASCII(s1) && PyUnicode_IS_ASCII(s2)) + if (isascii) result = asciilib_find_slice(buf1, len1, buf2, len2, start, end); else result = ucs1lib_find_slice(buf1, len1, buf2, len2, start, end); @@ -9116,7 +9107,7 @@ any_find_slice(PyObject* s1, PyObject* s2, else { switch (kind1) { case PyUnicode_1BYTE_KIND: - if (PyUnicode_IS_ASCII(s1) && PyUnicode_IS_ASCII(s2)) + if (isascii) result = asciilib_rfind_slice(buf1, len1, buf2, len2, start, end); else result = ucs1lib_rfind_slice(buf1, len1, buf2, len2, start, end); @@ -9132,13 +9123,208 @@ any_find_slice(PyObject* s1, PyObject* s2, } } - assert((kind2 != kind1) == (buf2 != PyUnicode_DATA(s2))); if (kind2 != kind1) PyMem_Free((void *)buf2); return result; } +static Py_ssize_t +find_sub(PyObject* s1, PyObject* s2, + Py_ssize_t start, Py_ssize_t end, + int direction) +{ + int kind1, kind2, isascii1, isascii2; + const void *buf1, *buf2; + Py_ssize_t len1, len2; + + kind1 = PyUnicode_KIND(s1); + kind2 = PyUnicode_KIND(s2); + if (kind1 < kind2) + return -1; + + 
isascii1 = PyUnicode_IS_ASCII(s1); + isascii2 = PyUnicode_IS_ASCII(s2); + if (!isascii2 && isascii1) + return -1; + + len1 = PyUnicode_GET_LENGTH(s1); + len2 = PyUnicode_GET_LENGTH(s2); + ADJUST_INDICES(start, end, len1); + + buf1 = PyUnicode_DATA(s1); + buf2 = PyUnicode_DATA(s2); + + return fast_find_sub(buf1, kind1, len1, buf2, kind2, len2, start, end, + isascii1, direction); +} + +static Py_ssize_t +chunk_find_sub(const void *buf1, int kind1, int isascii1, Py_ssize_t len1, + PyObject* s2, + Py_ssize_t chunk_start, Py_ssize_t chunk_end, + Py_ssize_t end, int direction) +{ + int kind2, isascii2; + const void *buf2; + Py_ssize_t len2; + + assert(chunk_end <= end); + kind2 = PyUnicode_KIND(s2); + if (kind1 < kind2) + return -1; + + isascii2 = PyUnicode_IS_ASCII(s2); + if (!isascii2 && isascii1) + return -1; + + len2 = PyUnicode_GET_LENGTH(s2); + buf2 = PyUnicode_DATA(s2); + + if (chunk_end >= end - len2) { // Guard overflow + return fast_find_sub(buf1, kind1, len1, buf2, kind2, len2, chunk_start, + end, isascii1, direction); + } + else { + return fast_find_sub(buf1, kind1, len1, buf2, kind2, len2, chunk_start, + chunk_end + len2, isascii1, direction); + } +} + +#define FIND_MIN_CHUNK_SIZE 32 +#define FIND_MAX_CHUNK_SIZE 16384 +#define FIND_EXP_CHUNK_SIZE 2 + +static Py_ssize_t +find_subs(PyObject *str, const char *function_name, + PyObject *subobj, Py_ssize_t start, Py_ssize_t end, + int direction) +{ + Py_ssize_t tuple_len, result, chunk_size, len1; + const void *buf1; + int kind1, isascii1; + + tuple_len = PyTuple_GET_SIZE(subobj); + for (Py_ssize_t i = 0; i < tuple_len; i++) { + PyObject *substr = PyTuple_GET_ITEM(subobj, i); + if (!PyUnicode_Check(substr)) { + PyErr_Format(PyExc_TypeError, + "tuple for %.200s must only contain str, " + "not %.100s", function_name, + Py_TYPE(substr)->tp_name); + return -2; + } + } + if (tuple_len <= 1) { + if (tuple_len == 0) { + return -1; + } + PyObject *substr = PyTuple_GET_ITEM(subobj, 0); + return find_sub(str, substr, 
start, end, direction); + } + assert(FIND_MIN_CHUNK_SIZE > 0); + assert(FIND_MAX_CHUNK_SIZE >= FIND_MIN_CHUNK_SIZE); + assert(FIND_EXP_CHUNK_SIZE >= 1); + result = -1; + chunk_size = FIND_MIN_CHUNK_SIZE; + buf1 = PyUnicode_DATA(str); + kind1 = PyUnicode_KIND(str); + isascii1 = PyUnicode_IS_ASCII(str); + len1 = PyUnicode_GET_LENGTH(str); + ADJUST_INDICES(start, end, len1); + if (direction > 0) { + Py_ssize_t chunk_start = start; + while (1) { + Py_ssize_t chunk_end; + if (chunk_start >= end - chunk_size) { // Guard overflow + chunk_end = end; + } + else { + chunk_end = chunk_start + chunk_size - 1; + } + for (Py_ssize_t i = 0; i < tuple_len; i++) { + PyObject *substr; + Py_ssize_t new_result; + + substr = PyTuple_GET_ITEM(subobj, i); + new_result = chunk_find_sub(buf1, kind1, isascii1, len1, + substr, chunk_start, chunk_end, + end, +1); + if (new_result != -1) { + if (new_result == -2 || new_result == chunk_start) { + return new_result; + } + chunk_end = new_result - 1; // Only allow earlier match + result = new_result; + } + } + if (result != -1 || chunk_end >= end) { + // Found match or searched entire range (guard overflow) + return result; + } + chunk_start = chunk_end + 1; + chunk_size *= FIND_EXP_CHUNK_SIZE; + if (chunk_size > FIND_MAX_CHUNK_SIZE) { + chunk_size = FIND_MAX_CHUNK_SIZE; + } + } + } + else { + Py_ssize_t chunk_end = end; + while (1) { + Py_ssize_t chunk_start = chunk_end - chunk_size + 1; + if (chunk_start - 1 <= start) { + chunk_start = start; + } + for (Py_ssize_t i = 0; i < tuple_len; i++) { + PyObject *substr; + Py_ssize_t new_result; + + substr = PyTuple_GET_ITEM(subobj, i); + new_result = chunk_find_sub(buf1, kind1, isascii1, len1, + substr, chunk_start, chunk_end, + end, -1); + if (new_result != -1) { + if (new_result == -2 || new_result == chunk_end) { + return new_result; + } + chunk_start = new_result + 1; // Only allow later match + result = new_result; + } + } + if (result != -1 || chunk_start <= start) { + // Found match or 
searched entire range + return result; + } + chunk_end = chunk_start - 1; + chunk_size *= FIND_EXP_CHUNK_SIZE; + if (chunk_size > FIND_MAX_CHUNK_SIZE) { + chunk_size = FIND_MAX_CHUNK_SIZE; + } + } + } +} + +static inline Py_ssize_t +find(PyObject *str, const char *function_name, + PyObject *subobj, Py_ssize_t start, Py_ssize_t end, + int direction) +{ + if (PyTuple_Check(subobj)) { + return find_subs(str, function_name, subobj, start, end, direction); + } + else if (!PyUnicode_Check(subobj)) { + PyErr_Format(PyExc_TypeError, + "%.200s first arg must be str or " + "a tuple of str, not %.100s", function_name, + Py_TYPE(subobj)->tp_name); + return -2; + } + else { + return find_sub(str, subobj, start, end, direction); + } +} + /* _PyUnicode_InsertThousandsGrouping() helper functions */ #include "stringlib/localeutil.h" @@ -9297,7 +9483,7 @@ PyUnicode_Find(PyObject *str, if (ensure_unicode(str) < 0 || ensure_unicode(substr) < 0) return -2; - return any_find_slice(str, substr, start, end, direction); + return find_sub(str, substr, start, end, direction); } Py_ssize_t @@ -11337,7 +11523,13 @@ unicode_expandtabs_impl(PyObject *self, int tabsize) } /*[clinic input] -str.find as unicode_find = str.count +str.find as unicode_find -> Py_ssize_t + + self as str: self + sub: object + start: slice_index(accept={int, NoneType}, c_default='0') = None + end: slice_index(accept={int, NoneType}, c_default='PY_SSIZE_T_MAX') = None + / Return the lowest index in S where substring sub is found, such that sub is contained within S[start:end]. @@ -11346,11 +11538,11 @@ Return -1 on failure. 
[clinic start generated code]*/ static Py_ssize_t -unicode_find_impl(PyObject *str, PyObject *substr, Py_ssize_t start, +unicode_find_impl(PyObject *str, PyObject *sub, Py_ssize_t start, Py_ssize_t end) -/*[clinic end generated code: output=51dbe6255712e278 input=4a89d2d68ef57256]*/ +/*[clinic end generated code: output=da52b0913b08a960 input=a236fecd6e36a36a]*/ { - Py_ssize_t result = any_find_slice(str, substr, start, end, 1); + Py_ssize_t result = find(str, "find", sub, start, end, +1); if (result < 0) { return -1; } @@ -11400,7 +11592,7 @@ unicode_hash(PyObject *self) } /*[clinic input] -str.index as unicode_index = str.count +str.index as unicode_index = str.find Return the lowest index in S where substring sub is found, such that sub is contained within S[start:end]. @@ -11409,11 +11601,11 @@ Raises ValueError when the substring is not found. [clinic start generated code]*/ static Py_ssize_t -unicode_index_impl(PyObject *str, PyObject *substr, Py_ssize_t start, +unicode_index_impl(PyObject *str, PyObject *sub, Py_ssize_t start, Py_ssize_t end) -/*[clinic end generated code: output=77558288837cdf40 input=d986aeac0be14a1c]*/ +/*[clinic end generated code: output=4f3129c11e833e01 input=f0033cf1698b6108]*/ { - Py_ssize_t result = any_find_slice(str, substr, start, end, 1); + Py_ssize_t result = find(str, "index", sub, start, end, +1); if (result == -1) { PyErr_SetString(PyExc_ValueError, "substring not found"); } @@ -12430,7 +12622,7 @@ unicode_repr(PyObject *unicode) } /*[clinic input] -str.rfind as unicode_rfind = str.count +str.rfind as unicode_rfind = str.find Return the highest index in S where substring sub is found, such that sub is contained within S[start:end]. @@ -12439,11 +12631,11 @@ Return -1 on failure. 
[clinic start generated code]*/ static Py_ssize_t -unicode_rfind_impl(PyObject *str, PyObject *substr, Py_ssize_t start, +unicode_rfind_impl(PyObject *str, PyObject *sub, Py_ssize_t start, Py_ssize_t end) -/*[clinic end generated code: output=880b29f01dd014c8 input=898361fb71f59294]*/ +/*[clinic end generated code: output=0576fddc53b8616e input=23ae7964e8f70b35]*/ { - Py_ssize_t result = any_find_slice(str, substr, start, end, -1); + Py_ssize_t result = find(str, "rfind", sub, start, end, -1); if (result < 0) { return -1; } @@ -12451,7 +12643,7 @@ unicode_rfind_impl(PyObject *str, PyObject *substr, Py_ssize_t start, } /*[clinic input] -str.rindex as unicode_rindex = str.count +str.rindex as unicode_rindex = str.find Return the highest index in S where substring sub is found, such that sub is contained within S[start:end]. @@ -12460,11 +12652,11 @@ Raises ValueError when the substring is not found. [clinic start generated code]*/ static Py_ssize_t -unicode_rindex_impl(PyObject *str, PyObject *substr, Py_ssize_t start, +unicode_rindex_impl(PyObject *str, PyObject *sub, Py_ssize_t start, Py_ssize_t end) -/*[clinic end generated code: output=5f3aef124c867fe1 input=35943dead6c1ea9d]*/ +/*[clinic end generated code: output=137ad2933d200f38 input=990f3925b149c1bc]*/ { - Py_ssize_t result = any_find_slice(str, substr, start, end, -1); + Py_ssize_t result = find(str, "rindex", sub, start, end, -1); if (result == -1) { PyErr_SetString(PyExc_ValueError, "substring not found"); }