diff --git a/Lib/test/test_re.py b/Lib/test/test_re.py index 756a7ccd506be4..130e1e1d01dec5 100644 --- a/Lib/test/test_re.py +++ b/Lib/test/test_re.py @@ -2621,6 +2621,50 @@ def test_regression_gh94675(self): p.terminate() p.join() + def test_fail(self): + self.assertEqual(re.search(r'12(?!)|3', '123')[0], '3') + + def test_character_set_any(self): + # The union of complementary character sets matches any character + # and is equivalent to "(?s:.)". + s = '1x\n' + for p in r'[\s\S]', r'[\d\D]', r'[\w\W]', r'[\S\s]', r'\s|\S': + with self.subTest(pattern=p): + self.assertEqual(re.findall(p, s), list(s)) + self.assertEqual(re.fullmatch('(?:' + p + ')+', s).group(), s) + + def test_character_set_none(self): + # Negation of the union of complementary character sets does not match + # any character. + s = '1x\n' + for p in r'[^\s\S]', r'[^\d\D]', r'[^\w\W]', r'[^\S\s]': + with self.subTest(pattern=p): + self.assertIsNone(re.search(p, s)) + self.assertIsNone(re.search('(?s:.)' + p, s)) + + def check_interrupt(self, pattern, string, maxcount): + class Interrupt(Exception): + pass + p = re.compile(pattern) + for n in range(maxcount): + try: + p._fail_after(n, Interrupt) + p.match(string) + return n + except Interrupt: + pass + finally: + p._fail_after(-1, None) + + @unittest.skipUnless(hasattr(re.Pattern, '_fail_after'), 'requires debug build') + def test_memory_leaks(self): + self.check_interrupt(r'(.)*:', 'abc:', 100) + self.check_interrupt(r'([^:])*?:', 'abc:', 100) + self.check_interrupt(r'([^:])*+:', 'abc:', 100) + self.check_interrupt(r'(.){2,4}:', 'abc:', 100) + self.check_interrupt(r'([^:]){2,4}?:', 'abc:', 100) + self.check_interrupt(r'([^:]){2,4}+:', 'abc:', 100) + def get_debug_out(pat): with captured_stdout() as out: diff --git a/Misc/NEWS.d/next/Library/2024-11-14-22-25-49.gh-issue-67877.G9hw0w.rst b/Misc/NEWS.d/next/Library/2024-11-14-22-25-49.gh-issue-67877.G9hw0w.rst new file mode 100644 index 00000000000000..021b4ae2e100bc --- /dev/null +++ 
b/Misc/NEWS.d/next/Library/2024-11-14-22-25-49.gh-issue-67877.G9hw0w.rst @@ -0,0 +1,2 @@ +Fix memory leaks when :mod:`regular expression <re>` matching terminates +abruptly, either because of a signal or because memory allocation fails. diff --git a/Modules/_sre/clinic/sre.c.h b/Modules/_sre/clinic/sre.c.h index 529c634e76d63c..56a4e6048fa8ef 100644 --- a/Modules/_sre/clinic/sre.c.h +++ b/Modules/_sre/clinic/sre.c.h @@ -975,6 +975,44 @@ PyDoc_STRVAR(_sre_SRE_Pattern___deepcopy____doc__, #define _SRE_SRE_PATTERN___DEEPCOPY___METHODDEF \ {"__deepcopy__", (PyCFunction)_sre_SRE_Pattern___deepcopy__, METH_O, _sre_SRE_Pattern___deepcopy____doc__}, +#if defined(Py_DEBUG) + +PyDoc_STRVAR(_sre_SRE_Pattern__fail_after__doc__, +"_fail_after($self, count, exception, /)\n" +"--\n" +"\n" +"For debugging."); + +#define _SRE_SRE_PATTERN__FAIL_AFTER_METHODDEF \ + {"_fail_after", _PyCFunction_CAST(_sre_SRE_Pattern__fail_after), METH_FASTCALL, _sre_SRE_Pattern__fail_after__doc__}, + +static PyObject * +_sre_SRE_Pattern__fail_after_impl(PatternObject *self, int count, + PyObject *exception); + +static PyObject * +_sre_SRE_Pattern__fail_after(PatternObject *self, PyObject *const *args, Py_ssize_t nargs) +{ + PyObject *return_value = NULL; + int count; + PyObject *exception; + + if (!_PyArg_CheckPositional("_fail_after", nargs, 2, 2)) { + goto exit; + } + count = _PyLong_AsInt(args[0]); + if (count == -1 && PyErr_Occurred()) { + goto exit; + } + exception = args[1]; + return_value = _sre_SRE_Pattern__fail_after_impl(self, count, exception); + +exit: + return return_value; +} + +#endif /* defined(Py_DEBUG) */ + PyDoc_STRVAR(_sre_compile__doc__, "compile($module, /, pattern, flags, code, groups, groupindex,\n" " indexgroup)\n" @@ -1460,4 +1498,8 @@ _sre_SRE_Scanner_search(ScannerObject *self, PyTypeObject *cls, PyObject *const } return _sre_SRE_Scanner_search_impl(self, cls); } -/*[clinic end generated code: output=045de53cfe02dee0 input=a9049054013a1b77]*/ + +#ifndef
_SRE_SRE_PATTERN__FAIL_AFTER_METHODDEF + #define _SRE_SRE_PATTERN__FAIL_AFTER_METHODDEF +#endif /* !defined(_SRE_SRE_PATTERN__FAIL_AFTER_METHODDEF) */ +/*[clinic end generated code: output=2165ecf43a7c20e8 input=a9049054013a1b77]*/ diff --git a/Modules/_sre/sre.c b/Modules/_sre/sre.c index 6d9843bb76d791..35c6058dac4eae 100644 --- a/Modules/_sre/sre.c +++ b/Modules/_sre/sre.c @@ -218,6 +218,85 @@ data_stack_grow(SRE_STATE* state, Py_ssize_t size) return 0; } +/* memory pool functions for SRE_REPEAT, this can avoid memory + leak when SRE(match) function terminates abruptly. + state->repeat_pool_used is a doubly-linked list, so that we + can remove a SRE_REPEAT node from it. + state->repeat_pool_unused is a singly-linked list, we put/get + node at the head. */ +static SRE_REPEAT * +repeat_pool_malloc(SRE_STATE *state) +{ + SRE_REPEAT *repeat; + + if (state->repeat_pool_unused) { + /* remove from unused pool (singly-linked list) */ + repeat = state->repeat_pool_unused; + state->repeat_pool_unused = repeat->pool_next; + } + else { + repeat = PyObject_Malloc(sizeof(SRE_REPEAT)); + if (!repeat) { + return NULL; + } + } + + /* add to used pool (doubly-linked list) */ + SRE_REPEAT *temp = state->repeat_pool_used; + if (temp) { + temp->pool_prev = repeat; + } + repeat->pool_prev = NULL; + repeat->pool_next = temp; + state->repeat_pool_used = repeat; + + return repeat; +} + +static void +repeat_pool_free(SRE_STATE *state, SRE_REPEAT *repeat) +{ + SRE_REPEAT *prev = repeat->pool_prev; + SRE_REPEAT *next = repeat->pool_next; + + /* remove from used pool (doubly-linked list) */ + if (prev) { + prev->pool_next = next; + } + else { + state->repeat_pool_used = next; + } + if (next) { + next->pool_prev = prev; + } + + /* add to unused pool (singly-linked list) */ + repeat->pool_next = state->repeat_pool_unused; + state->repeat_pool_unused = repeat; +} + +static void +repeat_pool_clear(SRE_STATE *state) +{ + /* clear used pool */ + SRE_REPEAT *next = state->repeat_pool_used; + 
state->repeat_pool_used = NULL; + while (next) { + SRE_REPEAT *temp = next; + next = temp->pool_next; + PyObject_Free(temp); + } + + /* clear unused pool */ + next = state->repeat_pool_unused; + state->repeat_pool_unused = NULL; + while (next) { + SRE_REPEAT *temp = next; + next = temp->pool_next; + PyObject_Free(temp); + } +} + /* generate 8-bit version */ #define SRE_CHAR Py_UCS1 @@ -463,6 +542,11 @@ state_init(SRE_STATE* state, PatternObject* pattern, PyObject* string, state->pos = start; state->endpos = end; +#ifdef Py_DEBUG + state->fail_after_count = pattern->fail_after_count; + state->fail_after_exc = pattern->fail_after_exc; // borrowed ref +#endif + return string; err: /* We add an explicit cast here because MSVC has a bug when @@ -485,6 +569,8 @@ state_fini(SRE_STATE* state) /* See above PyMem_Del for why we explicitly cast here. */ PyMem_Free((void*) state->mark); state->mark = NULL; + /* SRE_REPEAT pool */ + repeat_pool_clear(state); } /* calculate offset from start of string */ @@ -571,6 +657,9 @@ pattern_traverse(PatternObject *self, visitproc visit, void *arg) Py_VISIT(self->groupindex); Py_VISIT(self->indexgroup); Py_VISIT(self->pattern); +#ifdef Py_DEBUG + Py_VISIT(self->fail_after_exc); +#endif return 0; } @@ -580,6 +669,9 @@ pattern_clear(PatternObject *self) Py_CLEAR(self->groupindex); Py_CLEAR(self->indexgroup); Py_CLEAR(self->pattern); +#ifdef Py_DEBUG + Py_CLEAR(self->fail_after_exc); +#endif return 0; } @@ -642,7 +734,7 @@ _sre_SRE_Pattern_match_impl(PatternObject *self, PyTypeObject *cls, Py_ssize_t status; PyObject *match; - if (!state_init(&state, (PatternObject *)self, string, pos, endpos)) + if (!state_init(&state, self, string, pos, endpos)) return NULL; state.ptr = state.start; @@ -1330,6 +1422,29 @@ _sre_SRE_Pattern___deepcopy__(PatternObject *self, PyObject *memo) return Py_NewRef(self); } +#ifdef Py_DEBUG +/*[clinic input] +_sre.SRE_Pattern._fail_after + + count: int + exception: object + / + +For debugging. 
+[clinic start generated code]*/ + +static PyObject * +_sre_SRE_Pattern__fail_after_impl(PatternObject *self, int count, + PyObject *exception) +/*[clinic end generated code: output=9a6bf12135ac50c2 input=ef80a45c66c5499d]*/ +{ + self->fail_after_count = count; + Py_INCREF(exception); + Py_XSETREF(self->fail_after_exc, exception); + Py_RETURN_NONE; +} +#endif /* Py_DEBUG */ + static PyObject * pattern_repr(PatternObject *obj) { @@ -1456,6 +1571,10 @@ _sre_compile_impl(PyObject *module, PyObject *pattern, int flags, self->pattern = NULL; self->groupindex = NULL; self->indexgroup = NULL; +#ifdef Py_DEBUG + self->fail_after_count = -1; + self->fail_after_exc = NULL; +#endif self->codesize = n; @@ -2552,7 +2671,8 @@ pattern_new_match(_sremodulestate* module_state, if (!match) return NULL; - match->pattern = (PatternObject*)Py_NewRef(pattern); + Py_INCREF(pattern); + match->pattern = pattern; match->string = Py_NewRef(state->string); @@ -2688,7 +2808,7 @@ _sre_SRE_Scanner_match_impl(ScannerObject *self, PyTypeObject *cls) return NULL; } - match = pattern_new_match(module_state, (PatternObject*) self->pattern, + match = pattern_new_match(module_state, self->pattern, state, status); if (status == 0) @@ -2738,7 +2858,7 @@ _sre_SRE_Scanner_search_impl(ScannerObject *self, PyTypeObject *cls) return NULL; } - match = pattern_new_match(module_state, (PatternObject*) self->pattern, + match = pattern_new_match(module_state, self->pattern, state, status); if (status == 0) @@ -2774,7 +2894,8 @@ pattern_scanner(_sremodulestate *module_state, return NULL; } - scanner->pattern = Py_NewRef(self); + Py_INCREF(self); + scanner->pattern = self; PyObject_GC_Track(scanner); return (PyObject*) scanner; @@ -2968,6 +3089,7 @@ static PyMethodDef pattern_methods[] = { _SRE_SRE_PATTERN_SCANNER_METHODDEF _SRE_SRE_PATTERN___COPY___METHODDEF _SRE_SRE_PATTERN___DEEPCOPY___METHODDEF + _SRE_SRE_PATTERN__FAIL_AFTER_METHODDEF {"__class_getitem__", Py_GenericAlias, METH_O|METH_CLASS, PyDoc_STR("See PEP 
585")}, {NULL, NULL} diff --git a/Modules/_sre/sre.h b/Modules/_sre/sre.h index a0f235606e290e..b8c6f8e3e660d1 100644 --- a/Modules/_sre/sre.h +++ b/Modules/_sre/sre.h @@ -34,6 +34,11 @@ typedef struct { int flags; /* flags used when compiling pattern source */ PyObject *weakreflist; /* List of weak references */ int isbytes; /* pattern type (1 - bytes, 0 - string, -1 - None) */ +#ifdef Py_DEBUG + /* for simulation of user interruption */ + int fail_after_count; + PyObject *fail_after_exc; +#endif /* pattern code */ Py_ssize_t codesize; SRE_CODE code[1]; @@ -68,6 +73,9 @@ typedef struct SRE_REPEAT_T { const SRE_CODE* pattern; /* points to REPEAT operator arguments */ const void* last_ptr; /* helper to check for infinite loops */ struct SRE_REPEAT_T *prev; /* points to previous repeat context */ + /* for SRE_REPEAT pool */ + struct SRE_REPEAT_T *pool_prev; + struct SRE_REPEAT_T *pool_next; } SRE_REPEAT; typedef struct { @@ -94,12 +102,19 @@ typedef struct { size_t data_stack_base; /* current repeat context */ SRE_REPEAT *repeat; + /* SRE_REPEAT pool */ + SRE_REPEAT *repeat_pool_used; + SRE_REPEAT *repeat_pool_unused; unsigned int sigcount; +#ifdef Py_DEBUG + int fail_after_count; + PyObject *fail_after_exc; +#endif } SRE_STATE; typedef struct { PyObject_HEAD - PyObject* pattern; + PatternObject* pattern; SRE_STATE state; int executing; } ScannerObject; diff --git a/Modules/_sre/sre_lib.h b/Modules/_sre/sre_lib.h index 95c1ada908d222..d82ba7aa3c8b83 100644 --- a/Modules/_sre/sre_lib.h +++ b/Modules/_sre/sre_lib.h @@ -524,13 +524,28 @@ typedef struct { Py_ssize_t last_ctx_pos; } SRE(match_context); -#define MAYBE_CHECK_SIGNALS \ +#define _MAYBE_CHECK_SIGNALS \ do { \ if ((0 == (++sigcount & 0xfff)) && PyErr_CheckSignals()) { \ RETURN_ERROR(SRE_ERROR_INTERRUPTED); \ } \ } while (0) +#ifdef Py_DEBUG +# define MAYBE_CHECK_SIGNALS \ + do { \ + _MAYBE_CHECK_SIGNALS; \ + if (state->fail_after_count >= 0) { \ + if (state->fail_after_count-- == 0) { \ + 
PyErr_SetNone(state->fail_after_exc); \ + RETURN_ERROR(SRE_ERROR_INTERRUPTED); \ + } \ + } \ + } while (0) +#else +# define MAYBE_CHECK_SIGNALS _MAYBE_CHECK_SIGNALS +#endif /* Py_DEBUG */ + #ifdef HAVE_COMPUTED_GOTOS #ifndef USE_COMPUTED_GOTOS #define USE_COMPUTED_GOTOS 1 @@ -1083,12 +1098,9 @@ SRE(match)(SRE_STATE* state, const SRE_CODE* pattern, int toplevel) pattern[1], pattern[2])); /* install new repeat context */ - /* TODO(https://github.com/python/cpython/issues/67877): Fix this - * potential memory leak. */ - ctx->u.rep = (SRE_REPEAT*) PyObject_Malloc(sizeof(*ctx->u.rep)); + ctx->u.rep = repeat_pool_malloc(state); if (!ctx->u.rep) { - PyErr_NoMemory(); - RETURN_FAILURE; + RETURN_ERROR(SRE_ERROR_MEMORY); } ctx->u.rep->count = -1; ctx->u.rep->pattern = pattern; @@ -1099,7 +1111,7 @@ SRE(match)(SRE_STATE* state, const SRE_CODE* pattern, int toplevel) state->ptr = ptr; DO_JUMP(JUMP_REPEAT, jump_repeat, pattern+pattern[0]); state->repeat = ctx->u.rep->prev; - PyObject_Free(ctx->u.rep); + repeat_pool_free(state, ctx->u.rep); if (ret) { RETURN_ON_ERROR(ret);