Skip to content

Commit 1c2c4d4

Browse files
<regex>: Process greedy simple loops non-recursively (#5790)
Co-authored-by: Stephan T. Lavavej <[email protected]>
1 parent 7f60cfb commit 1c2c4d4

File tree

2 files changed

+124
-91
lines changed

2 files changed

+124
-91
lines changed

stl/inc/regex

Lines changed: 43 additions & 91 deletions
Original file line number | Diff line number | Diff line change
@@ -1681,6 +1681,7 @@ enum class _Rx_unwind_ops {
16811681
_Disjunction_eval_alt_always,
16821682
_Do_nothing,
16831683
_Loop_simple_nongreedy,
1684+
_Loop_simple_greedy,
16841685
};
16851686

16861687
template <class _BidIt>
@@ -1815,7 +1816,6 @@ private:
18151816
void _Decrease_stack_usage_count();
18161817
void _Increase_complexity_count();
18171818

1818-
bool _Do_rep0(_Node_rep*);
18191819
bool _Do_rep(_Node_rep*, bool, int);
18201820
void _Prepare_rep(_Node_rep*);
18211821
bool _Find_first_inner_capture_group(_Node_base*, _Loop_vals_v2_t*);
@@ -3413,72 +3413,6 @@ void _Matcher3<_BidIt, _Elem, _RxTraits, _It, _Alloc>::_Increase_complexity_coun
34133413
}
34143414
}
34153415

3416-
template <class _BidIt, class _Elem, class _RxTraits, class _It, class _Alloc>
3417-
bool _Matcher3<_BidIt, _Elem, _RxTraits, _It, _Alloc>::_Do_rep0(_Node_rep* _Node) {
3418-
// apply repetition to loop with no nested if/do
3419-
int _Ix = _Node->_Min;
3420-
const size_t _Frame_idx = _Loop_vals[_Node->_Loop_number]._Loop_frame_idx;
3421-
_Loop_vals[_Node->_Loop_number]._Loop_idx = _Ix + 2;
3422-
3423-
_Tgt_state_t<_It> _Final;
3424-
bool _Matched0 = false;
3425-
_It _Saved_pos = _Tgt_state._Cur;
3426-
bool _Done = false;
3427-
3428-
if (_Match_pat(_Node->_End_rep->_Next)) {
3429-
// record an acceptable match and continue
3430-
_Final = _Tgt_state;
3431-
_Matched0 = true;
3432-
}
3433-
3434-
if (_Ix == 0 && _Node->_Max != 0) {
3435-
_Tgt_state._Cur = _Saved_pos;
3436-
_Tgt_state._Grp_valid = _Frames[_Frame_idx]._Match_state._Grp_valid;
3437-
3438-
if (!_Match_pat(_Node->_Next)) { // rep match failed, we are done
3439-
_Done = true;
3440-
} else if (_Saved_pos == _Tgt_state._Cur) { // match empty, try no more repetitions
3441-
_Done = true;
3442-
// we only potentially accept/try tail for POSIX
3443-
if ((_Sflags & regex_constants::_Any_posix) && _Match_pat(_Node->_End_rep->_Next)) {
3444-
return true; // go with current match
3445-
}
3446-
} else {
3447-
_Saved_pos = _Tgt_state._Cur;
3448-
if (_Match_pat(_Node->_End_rep->_Next)) {
3449-
// record match and continue
3450-
_Final = _Tgt_state;
3451-
_Matched0 = true;
3452-
}
3453-
}
3454-
_Ix = 1;
3455-
}
3456-
3457-
if (!_Done) {
3458-
while (_Node->_Max == -1 || _Ix++ < _Node->_Max) { // try another rep/tail match
3459-
_Tgt_state._Cur = _Saved_pos;
3460-
_Tgt_state._Grp_valid = _Frames[_Frame_idx]._Match_state._Grp_valid;
3461-
if (!_Match_pat(_Node->_Next) || _Tgt_state._Cur == _Saved_pos) {
3462-
break; // rep match failed, quit loop
3463-
}
3464-
3465-
// since loop is branchless, empty rep match is not possible at this point
3466-
_Saved_pos = _Tgt_state._Cur;
3467-
if (_Match_pat(_Node->_End_rep->_Next)) {
3468-
// record match and continue
3469-
_Final = _Tgt_state;
3470-
_Matched0 = true;
3471-
}
3472-
}
3473-
}
3474-
3475-
if (_Matched0) { // record final match
3476-
_Tgt_state = _Final;
3477-
}
3478-
3479-
return _Matched0;
3480-
}
3481-
34823416
template <class _BidIt, class _Elem, class _RxTraits, class _It, class _Alloc>
34833417
bool _Matcher3<_BidIt, _Elem, _RxTraits, _It, _Alloc>::_Do_rep(_Node_rep* _Node, bool _Greedy, int _Init_idx) {
34843418
// apply repetition
@@ -4117,32 +4051,31 @@ bool _Matcher3<_BidIt, _Elem, _RxTraits, _It, _Alloc>::_Match_pat(_Node_base* _N
41174051
break;
41184052

41194053
case _N_rep:
4120-
{
4054+
{ // handle start of loop
41214055
auto _Node = static_cast<_Node_rep*>(_Nx);
41224056
_Prepare_rep(_Node);
41234057
bool _Greedy = (_Node->_Flags & _Fl_greedy) != 0;
41244058

41254059
if (_Node->_Simple_loop == 1) {
41264060
auto& _Sav = _Loop_vals[_Node->_Loop_number];
41274061
_Sav._Loop_frame_idx = _Push_frame(_Rx_unwind_ops::_Do_nothing);
4128-
if (_Node->_Min > 0) { // try to match a rep
4129-
_Increase_complexity_count();
4062+
_Increase_complexity_count();
4063+
if (_Node->_Min > 0 || (_Greedy && !_Longest && _Node->_Max != 0)) { // try a rep first
41304064
_Sav._Loop_idx = 1;
41314065
// _Next is already assigned correctly for matching a rep
4132-
} else if (!_Greedy || _Longest) { // non-greedy matching
4133-
_Increase_complexity_count();
41344066

4135-
// try tail first
4067+
// set up stack unwinding for greedy matching if zero reps are also acceptable (so the tail can be tried on failure)
4068+
if (_Node->_Min == 0) {
4069+
_Push_frame(_Rx_unwind_ops::_Loop_simple_greedy, _Node);
4070+
}
4071+
} else { // try tail first
41364072
_Sav._Loop_idx = 0;
41374073
_Next = _Node->_End_rep->_Next;
41384074

41394075
// set up stack unwinding for non-greedy matching if at least one rep is allowed
41404076
if (_Node->_Max != 0) {
41414077
_Push_frame(_Rx_unwind_ops::_Loop_simple_nongreedy, _Node);
41424078
}
4143-
} else {
4144-
_Failed = !_Do_rep0(_Node);
4145-
_Next = nullptr;
41464079
}
41474080
} else {
41484081
_Failed = !_Do_rep(_Node, _Greedy, 0);
@@ -4153,7 +4086,7 @@ bool _Matcher3<_BidIt, _Elem, _RxTraits, _It, _Alloc>::_Match_pat(_Node_base* _N
41534086
break;
41544087

41554088
case _N_end_rep:
4156-
{
4089+
{ // handle end of loop
41574090
_Node_rep* _Nr = static_cast<_Node_end_rep*>(_Nx)->_Begin_rep;
41584091
auto& _Sav = _Loop_vals[_Nr->_Loop_number];
41594092
bool _Greedy = (_Nr->_Flags & _Fl_greedy) != 0;
@@ -4163,31 +4096,36 @@ bool _Matcher3<_BidIt, _Elem, _RxTraits, _It, _Alloc>::_Match_pat(_Node_base* _N
41634096
== _Frames[_Sav._Loop_frame_idx]._Match_state._Cur) { // initial match empty
41644097
// loop is branchless, so it will only ever match empty strings
41654098
// -> we only try tail for POSIX or if minimum number of reps is non-zero
4166-
if ((_Sflags & regex_constants::_Any_posix) || _Nr->_Min > 0) {
4167-
_Increase_complexity_count();
4168-
// _Next is already assigned correctly for matching tail
4169-
} else {
4099+
// _Next is already assigned correctly for matching tail
4100+
4101+
if (!(_Sflags & regex_constants::_Any_posix) && _Nr->_Min == 0) {
41704102
_Failed = true;
41714103
}
41724104
} else if (_Sav._Loop_idx < _Nr->_Min) { // at least one more rep to reach minimum
4173-
_Increase_complexity_count();
4174-
41754105
_Next = _Nr->_Next;
41764106
// GH-5365: We have to reset the capture groups from the second iteration on.
41774107
_Tgt_state._Grp_valid = _Frames[_Sav._Loop_frame_idx]._Match_state._Grp_valid;
41784108
++_Sav._Loop_idx;
4179-
} else if (_Longest || !_Greedy) {
4180-
_Increase_complexity_count();
4109+
} else if (_Greedy && !_Longest && _Sav._Loop_idx != _Nr->_Max) { // one more rep to try next
4110+
// set up stack unwinding for greedy matching
4111+
_Push_frame(_Rx_unwind_ops::_Loop_simple_greedy, _Nr);
4112+
4113+
_Next = _Nr->_Next;
4114+
// GH-5365: We have to reset the capture groups from the second iteration on.
4115+
_Tgt_state._Grp_valid = _Frames[_Sav._Loop_frame_idx]._Match_state._Grp_valid;
4116+
if (_Sav._Loop_idx < INT_MAX) { // avoid overflowing _Loop_idx
4117+
++_Sav._Loop_idx;
4118+
}
4119+
} else { // non-greedy or longest-mode (_Longest) matching, or greedy matching with maximum reached
41814120
// set up stack unwinding for non-greedy matching if one more rep is allowed
41824121
if (_Sav._Loop_idx != _Nr->_Max) {
41834122
_Push_frame(_Rx_unwind_ops::_Loop_simple_nongreedy, _Nr);
41844123
}
41854124
// _Next is already assigned correctly for matching tail
4186-
} else if (_Sav._Loop_idx == _Nr->_Min) { // greedy and minimum number of reps reached
4187-
_Failed = !_Do_rep0(_Nr);
4188-
_Next = nullptr;
4189-
} else { // internal _Match_pat(_Node->_Next) call in _Do_rep0()
4190-
_Next = nullptr;
4125+
}
4126+
4127+
if (!_Failed) {
4128+
_Increase_complexity_count();
41914129
}
41924130
} else {
41934131
_Failed = !_Do_rep(_Nr, _Greedy, _Sav._Loop_idx);
@@ -4297,6 +4235,20 @@ bool _Matcher3<_BidIt, _Elem, _RxTraits, _It, _Alloc>::_Match_pat(_Node_base* _N
42974235
}
42984236
break;
42994237

4238+
case _Rx_unwind_ops::_Loop_simple_greedy:
4239+
// try tail if matching one more rep failed
4240+
if (_Failed) {
4241+
auto _Node = static_cast<_Node_rep*>(_Frame._Node);
4242+
auto& _Sav = _Loop_vals[_Node->_Loop_number];
4243+
4244+
_Increase_complexity_count();
4245+
_Nx = _Node->_End_rep->_Next;
4246+
_Tgt_state._Cur = _Frame._Match_state._Cur;
4247+
_Tgt_state._Grp_valid = _Frames[_Sav._Loop_frame_idx]._Match_state._Grp_valid;
4248+
_Failed = false;
4249+
}
4250+
break;
4251+
43004252
default:
43014253
#if _ITERATOR_DEBUG_LEVEL != 0
43024254
_STL_REPORT_ERROR("internal stack of regex matcher corrupted");
@@ -5299,7 +5251,7 @@ void _Parser2<_FwdIt, _Elem, _RxTraits>::_Calculate_loop_simplicity(
52995251
break;
53005252
case _N_rep:
53015253
// _Node_rep inside another _Node_rep makes both not simple if _Outer_rep can be repeated more than once
5302-
// because _Matcher3::_Do_rep0() does not reset capture group boundaries when control is returned to it.
5254+
// because the matcher does not reset capture group boundaries when handling simple loops.
53035255
// If _Outer_rep can repeat at most once, we have to analyze the structure of the inner loop.
53045256
if (_Outer_rep) {
53055257
_Outer_rep->_Simple_loop = 0;

tests/std/tests/VSO_0000000_regex_use/test.cpp

Lines changed: 81 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -2133,6 +2133,7 @@ void test_gh_5774() {
21332133
// GH-5774: Process non-greedy and longest-mode simple loops non-recursively.
21342134
// This extends our test coverage on non-greedy simple loops with bounded number of repetitions.
21352135
g_regexTester.should_not_match("", "a+?");
2136+
g_regexTester.should_match("b", "a{0}?b");
21362137
g_regexTester.should_not_match("ab", "a{0}?b");
21372138
g_regexTester.should_match("ab", "a{0,1}?b");
21382139
g_regexTester.should_not_match("aab", "a{0,1}?b");
@@ -2143,6 +2144,85 @@ void test_gh_5774() {
21432144
g_regexTester.should_match("aaab", "a{1,3}?b");
21442145
}
21452146

2147+
void test_gh_5790() {
2148+
// GH-5790: Process greedy simple loops non-recursively.
2149+
// This extends our test coverage on (mainly greedy) simple loops.
2150+
g_regexTester.should_not_match("", "a+");
2151+
g_regexTester.should_match("b", "a{0}b");
2152+
g_regexTester.should_not_match("ab", "a{0}b");
2153+
g_regexTester.should_match("ab", "a{0,1}b");
2154+
g_regexTester.should_not_match("aab", "a{0,1}b");
2155+
g_regexTester.should_match("aab", "a{0,2}b");
2156+
g_regexTester.should_match("aab", "a{1,2}b");
2157+
g_regexTester.should_not_match("aab", "a{1}b");
2158+
g_regexTester.should_not_match("aaab", "a{1,2}b");
2159+
g_regexTester.should_match("aaab", "a{1,3}b");
2160+
2161+
// Check that greedy and non-greedy search find the appropriate match.
2162+
// For the following regexes, greedy and leftmost-longest search yield the same matches.
2163+
for (syntax_option_type options : {ECMAScript, extended}) {
2164+
{
2165+
test_regex greedy_a_star(&g_regexTester, "a*", options);
2166+
greedy_a_star.should_search_match("aaaaaaaaaa", "aaaaaaaaaa");
2167+
}
2168+
2169+
{
2170+
test_regex bounded_greedy_a_rep(&g_regexTester, "a{5}", options);
2171+
bounded_greedy_a_rep.should_search_match("aaaaaaaaaa", "aaaaa");
2172+
}
2173+
2174+
{
2175+
test_regex upper_bounded_greedy_a_rep(&g_regexTester, "a{0,5}", options);
2176+
upper_bounded_greedy_a_rep.should_search_match("aaaaaaaaaa", "aaaaa");
2177+
}
2178+
2179+
{
2180+
test_regex lower_bounded_greedy_a_rep(&g_regexTester, "a{4,1000}", options);
2181+
lower_bounded_greedy_a_rep.should_search_match("aaaaaaaaaa", "aaaaaaaaaa");
2182+
}
2183+
2184+
{
2185+
test_regex lower_and_upper_bounded_greedy_a_rep(&g_regexTester, "a{2,5}", options);
2186+
lower_and_upper_bounded_greedy_a_rep.should_search_match("aaaaaaaaaa", "aaaaa");
2187+
}
2188+
2189+
{
2190+
test_regex too_large_min_greedy_a_rep(&g_regexTester, "a{11,1000}", options);
2191+
too_large_min_greedy_a_rep.should_search_fail("aaaaaaaaaa");
2192+
}
2193+
}
2194+
2195+
{
2196+
test_regex nongreedy_a_star(&g_regexTester, "a*?");
2197+
nongreedy_a_star.should_search_match("aaaaaaaaaa", "");
2198+
}
2199+
2200+
{
2201+
test_regex bounded_nongreedy_a_rep(&g_regexTester, "a{5}?");
2202+
bounded_nongreedy_a_rep.should_search_match("aaaaaaaaaa", "aaaaa");
2203+
}
2204+
2205+
{
2206+
test_regex upper_bounded_nongreedy_a_rep(&g_regexTester, "a{0,5}?");
2207+
upper_bounded_nongreedy_a_rep.should_search_match("aaaaaaaaaa", "");
2208+
}
2209+
2210+
{
2211+
test_regex lower_bounded_nongreedy_a_rep(&g_regexTester, "a{4,1000}?");
2212+
lower_bounded_nongreedy_a_rep.should_search_match("aaaaaaaaaa", "aaaa");
2213+
}
2214+
2215+
{
2216+
test_regex lower_and_upper_bounded_nongreedy_a_rep(&g_regexTester, "a{2,5}?");
2217+
lower_and_upper_bounded_nongreedy_a_rep.should_search_match("aaaaaaaaaa", "aa");
2218+
}
2219+
2220+
{
2221+
test_regex too_large_min_nongreedy_a_rep(&g_regexTester, "a{11,1000}?");
2222+
too_large_min_nongreedy_a_rep.should_search_fail("aaaaaaaaaa");
2223+
}
2224+
}
2225+
21462226
int main() {
21472227
test_dev10_449367_case_insensitivity_should_work();
21482228
test_dev11_462743_regex_collate_should_not_disable_regex_icase();
@@ -2195,6 +2275,7 @@ int main() {
21952275
test_gh_5576();
21962276
test_gh_5672();
21972277
test_gh_5774();
2278+
test_gh_5790();
21982279

21992280
return g_regexTester.result();
22002281
}

0 commit comments

Comments
 (0)