Skip to content

Commit 4f63fa0

Browse files
authored
<regex>: Process non-greedy and longest-mode simple loops non-recursively (#5774)
1 parent 5f88083 commit 4f63fa0

File tree

2 files changed

+81
-36
lines changed

2 files changed

+81
-36
lines changed

stl/inc/regex

Lines changed: 66 additions & 36 deletions
Original file line numberDiff line numberDiff line change
@@ -1586,7 +1586,7 @@ public:
15861586
_Loop_number(_Number) {}
15871587

15881588
const int _Min;
1589-
const int _Max;
1589+
const int _Max; // non-negative if bounded, -1 if unbounded
15901590
_Node_end_rep* _End_rep;
15911591
unsigned int _Loop_number;
15921592
int _Simple_loop = -1; // -1 undetermined, 0 contains if/do, 1 simple
@@ -1680,6 +1680,7 @@ enum class _Rx_unwind_ops {
16801680
_Disjunction_eval_alt_on_failure,
16811681
_Disjunction_eval_alt_always,
16821682
_Do_nothing,
1683+
_Loop_simple_nongreedy,
16831684
};
16841685

16851686
template <class _BidIt>
@@ -1814,7 +1815,7 @@ private:
18141815
void _Decrease_stack_usage_count();
18151816
void _Increase_complexity_count();
18161817

1817-
bool _Do_rep0(_Node_rep*, bool);
1818+
bool _Do_rep0(_Node_rep*);
18181819
bool _Do_rep(_Node_rep*, bool, int);
18191820
void _Prepare_rep(_Node_rep*);
18201821
bool _Find_first_inner_capture_group(_Node_base*, _Loop_vals_v2_t*);
@@ -3413,22 +3414,18 @@ void _Matcher3<_BidIt, _Elem, _RxTraits, _It, _Alloc>::_Increase_complexity_coun
34133414
}
34143415

34153416
template <class _BidIt, class _Elem, class _RxTraits, class _It, class _Alloc>
3416-
bool _Matcher3<_BidIt, _Elem, _RxTraits, _It, _Alloc>::_Do_rep0(_Node_rep* _Node, bool _Greedy) {
3417+
bool _Matcher3<_BidIt, _Elem, _RxTraits, _It, _Alloc>::_Do_rep0(_Node_rep* _Node) {
34173418
// apply repetition to loop with no nested if/do
34183419
int _Ix = _Node->_Min;
34193420
const size_t _Frame_idx = _Loop_vals[_Node->_Loop_number]._Loop_frame_idx;
3420-
_Loop_vals[_Node->_Loop_number]._Loop_idx = _Ix + 1;
3421+
_Loop_vals[_Node->_Loop_number]._Loop_idx = _Ix + 2;
34213422

34223423
_Tgt_state_t<_It> _Final;
34233424
bool _Matched0 = false;
34243425
_It _Saved_pos = _Tgt_state._Cur;
34253426
bool _Done = false;
34263427

34273428
if (_Match_pat(_Node->_End_rep->_Next)) {
3428-
if (!_Greedy) {
3429-
return true; // go with current match
3430-
}
3431-
34323429
// record an acceptable match and continue
34333430
_Final = _Tgt_state;
34343431
_Matched0 = true;
@@ -3449,10 +3446,6 @@ bool _Matcher3<_BidIt, _Elem, _RxTraits, _It, _Alloc>::_Do_rep0(_Node_rep* _Node
34493446
} else {
34503447
_Saved_pos = _Tgt_state._Cur;
34513448
if (_Match_pat(_Node->_End_rep->_Next)) {
3452-
if (!_Greedy) {
3453-
return true; // go with current match
3454-
}
3455-
34563449
// record match and continue
34573450
_Final = _Tgt_state;
34583451
_Matched0 = true;
@@ -3472,10 +3465,6 @@ bool _Matcher3<_BidIt, _Elem, _RxTraits, _It, _Alloc>::_Do_rep0(_Node_rep* _Node
34723465
// since loop is branchless, empty rep match is not possible at this point
34733466
_Saved_pos = _Tgt_state._Cur;
34743467
if (_Match_pat(_Node->_End_rep->_Next)) {
3475-
if (!_Greedy) {
3476-
return true; // go with current match
3477-
}
3478-
34793468
// record match and continue
34803469
_Final = _Tgt_state;
34813470
_Matched0 = true;
@@ -4135,13 +4124,25 @@ bool _Matcher3<_BidIt, _Elem, _RxTraits, _It, _Alloc>::_Match_pat(_Node_base* _N
41354124

41364125
if (_Node->_Simple_loop == 1) {
41374126
auto& _Sav = _Loop_vals[_Node->_Loop_number];
4138-
_Sav._Loop_idx = 1;
41394127
_Sav._Loop_frame_idx = _Push_frame(_Rx_unwind_ops::_Do_nothing);
4140-
if (_Node->_Min == 0) {
4141-
_Failed = !_Do_rep0(_Node, _Greedy);
4142-
_Next = nullptr;
4143-
} else {
4128+
if (_Node->_Min > 0) { // try to match a rep
41444129
_Increase_complexity_count();
4130+
_Sav._Loop_idx = 1;
4131+
// _Next is already assigned correctly for matching a rep
4132+
} else if (!_Greedy || _Longest) { // non-greedy matching
4133+
_Increase_complexity_count();
4134+
4135+
// try tail first
4136+
_Sav._Loop_idx = 0;
4137+
_Next = _Node->_End_rep->_Next;
4138+
4139+
// set up stack unwinding for non-greedy matching if at least one rep is allowed
4140+
if (_Node->_Max != 0) {
4141+
_Push_frame(_Rx_unwind_ops::_Loop_simple_nongreedy, _Node);
4142+
}
4143+
} else {
4144+
_Failed = !_Do_rep0(_Node);
4145+
_Next = nullptr;
41454146
}
41464147
} else {
41474148
_Failed = !_Do_rep(_Node, _Greedy, 0);
@@ -4155,29 +4156,41 @@ bool _Matcher3<_BidIt, _Elem, _RxTraits, _It, _Alloc>::_Match_pat(_Node_base* _N
41554156
{
41564157
_Node_rep* _Nr = static_cast<_Node_end_rep*>(_Nx)->_Begin_rep;
41574158
auto& _Sav = _Loop_vals[_Nr->_Loop_number];
4159+
bool _Greedy = (_Nr->_Flags & _Fl_greedy) != 0;
41584160
if (_Nr->_Simple_loop != 0) {
4159-
if (_Sav._Loop_idx <= _Nr->_Min) {
4160-
if (_Sav._Loop_idx == 1
4161-
&& _Tgt_state._Cur == _Frames[_Sav._Loop_frame_idx]._Match_state._Cur) { // match empty
4162-
// loop is branchless, so it will only ever match empty strings
4163-
// -> skip all other matches as they don't change state and immediately try tail
4161+
if (_Sav._Loop_idx == 1
4162+
&& _Tgt_state._Cur
4163+
== _Frames[_Sav._Loop_frame_idx]._Match_state._Cur) { // initial match empty
4164+
// loop is branchless, so it will only ever match empty strings
4165+
// -> we only try tail for POSIX or if minimum number of reps is non-zero
4166+
if ((_Sflags & regex_constants::_Any_posix) || _Nr->_Min > 0) {
41644167
_Increase_complexity_count();
41654168
// _Next is already assigned correctly for matching tail
4166-
} else if (_Sav._Loop_idx < _Nr->_Min) { // needs at least one more rep to reach minimum
4167-
_Increase_complexity_count();
4168-
// GH-5365: We have to reset the capture groups from the second iteration on.
4169-
_Tgt_state._Grp_valid = _Frames[_Sav._Loop_frame_idx]._Match_state._Grp_valid;
4170-
_Next = _Nr->_Next;
4171-
++_Sav._Loop_idx;
4172-
} else { // minimum number of reps reached
4173-
_Failed = !_Do_rep0(_Nr, (_Nr->_Flags & _Fl_greedy) != 0);
4174-
_Next = nullptr;
4169+
} else {
4170+
_Failed = true;
41754171
}
4172+
} else if (_Sav._Loop_idx < _Nr->_Min) { // at least one more rep to reach minimum
4173+
_Increase_complexity_count();
4174+
4175+
_Next = _Nr->_Next;
4176+
// GH-5365: We have to reset the capture groups from the second iteration on.
4177+
_Tgt_state._Grp_valid = _Frames[_Sav._Loop_frame_idx]._Match_state._Grp_valid;
4178+
++_Sav._Loop_idx;
4179+
} else if (_Longest || !_Greedy) {
4180+
_Increase_complexity_count();
4181+
// set up stack unwinding for non-greedy matching if one more rep is allowed
4182+
if (_Sav._Loop_idx != _Nr->_Max) {
4183+
_Push_frame(_Rx_unwind_ops::_Loop_simple_nongreedy, _Nr);
4184+
}
4185+
// _Next is already assigned correctly for matching tail
4186+
} else if (_Sav._Loop_idx == _Nr->_Min) { // greedy and minimum number of reps reached
4187+
_Failed = !_Do_rep0(_Nr);
4188+
_Next = nullptr;
41764189
} else { // internal _Match_pat(_Node->_Next) call in _Do_rep0()
41774190
_Next = nullptr;
41784191
}
41794192
} else {
4180-
_Failed = !_Do_rep(_Nr, (_Nr->_Flags & _Fl_greedy) != 0, _Sav._Loop_idx);
4193+
_Failed = !_Do_rep(_Nr, _Greedy, _Sav._Loop_idx);
41814194
_Next = nullptr;
41824195
}
41834196
break;
@@ -4267,6 +4280,23 @@ bool _Matcher3<_BidIt, _Elem, _RxTraits, _It, _Alloc>::_Match_pat(_Node_base* _N
42674280
case _Rx_unwind_ops::_Do_nothing:
42684281
break;
42694282

4283+
case _Rx_unwind_ops::_Loop_simple_nongreedy:
4284+
// try one more rep after matching tail if necessary
4285+
if (_Longest || _Failed) {
4286+
auto _Node = static_cast<_Node_rep*>(_Frame._Node);
4287+
auto& _Sav = _Loop_vals[_Node->_Loop_number];
4288+
4289+
_Increase_complexity_count();
4290+
_Nx = _Node->_Next;
4291+
_Tgt_state._Cur = _Frame._Match_state._Cur;
4292+
_Tgt_state._Grp_valid = _Frames[_Sav._Loop_frame_idx]._Match_state._Grp_valid;
4293+
_Failed = false;
4294+
if (_Sav._Loop_idx < INT_MAX) { // avoid overflowing _Loop_idx
4295+
++_Sav._Loop_idx;
4296+
}
4297+
}
4298+
break;
4299+
42704300
default:
42714301
#if _ITERATOR_DEBUG_LEVEL != 0
42724302
_STL_REPORT_ERROR("internal stack of regex matcher corrupted");

tests/std/tests/VSO_0000000_regex_use/test.cpp

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2129,6 +2129,20 @@ void test_gh_5672() {
21292129
}
21302130
}
21312131

2132+
void test_gh_5774() {
2133+
// GH-5774: Process non-greedy and longest-mode simple loops non-recursively.
2134+
// Extends test coverage of non-greedy simple loops: one unbounded loop (`a+?`) followed by loops with bounded repetition counts (`a{m}?`, `a{m,n}?`).
// NOTE(review): the expectations below (e.g. "ab" vs. `a{0}?b` not matching) look like whole-string matching rather than searching - confirm against g_regexTester.
2135+
g_regexTester.should_not_match("", "a+?");
2136+
g_regexTester.should_not_match("ab", "a{0}?b");
2137+
g_regexTester.should_match("ab", "a{0,1}?b");
2138+
g_regexTester.should_not_match("aab", "a{0,1}?b");
2139+
g_regexTester.should_match("aab", "a{0,2}?b");
2140+
g_regexTester.should_match("aab", "a{1,2}?b");
2141+
g_regexTester.should_not_match("aab", "a{1}?b");
2142+
g_regexTester.should_not_match("aaab", "a{1,2}?b");
2143+
g_regexTester.should_match("aaab", "a{1,3}?b");
2144+
}
2145+
21322146
int main() {
21332147
test_dev10_449367_case_insensitivity_should_work();
21342148
test_dev11_462743_regex_collate_should_not_disable_regex_icase();
@@ -2180,6 +2194,7 @@ int main() {
21802194
test_gh_5509();
21812195
test_gh_5576();
21822196
test_gh_5672();
2197+
test_gh_5774();
21832198

21842199
return g_regexTester.result();
21852200
}

0 commit comments

Comments
 (0)