Skip to content

Commit 64ccbad

Browse files
authored
Enable lookarounds to influence atomicity (#118153)
* Enable lookarounds to influence atomicity

  As part of our auto-atomicity handling, today we give up when the subsequent node is a lookaround. This improves it to support the case when the subsequent node is a positive lookahead.

* Update src/libraries/System.Text.RegularExpressions/tests/UnitTests/RegexReductionTests.cs
1 parent f9fb081 commit 64ccbad

File tree

3 files changed

+68
-3
lines changed

3 files changed

+68
-3
lines changed

src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexNode.cs

Lines changed: 9 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -2326,11 +2326,17 @@ private static bool CanBeMadeAtomic(RegexNode node, RegexNode subsequent, bool i
23262326
return true;
23272327
}
23282328

2329-
// If this node is a {one/notone/set}loop, see if it overlaps with its successor in the concatenation.
2330-
// If it doesn't, then we can upgrade it to being a {one/notone/set}loopatomic.
2331-
// Doing so avoids unnecessary backtracking.
2329+
// If this node is a loop, see if it overlaps with its successor in the concatenation.
2330+
// If it doesn't, then we can upgrade it to being atomic to avoid unnecessary backtracking.
23322331
switch (node.Kind)
23332332
{
2333+
case RegexNodeKind when iterateNullableSubsequent && subsequent.Kind is RegexNodeKind.PositiveLookaround:
2334+
if (!CanBeMadeAtomic(node, subsequent.Child(0), iterateNullableSubsequent: false, allowLazy: allowLazy))
2335+
{
2336+
return false;
2337+
}
2338+
break;
2339+
23342340
case RegexNodeKind.Oneloop:
23352341
case RegexNodeKind.Onelazy when allowLazy:
23362342
switch (subsequent.Kind)

src/libraries/System.Text.RegularExpressions/tests/FunctionalTests/Regex.Match.Tests.cs

Lines changed: 54 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -93,9 +93,55 @@ public static IEnumerable<object[]> Match_MemberData()
9393
yield return (@"(?:(?!(b)b)\1a)*", "babababa", RegexOptions.None, 0, 8, true, string.Empty);
9494
yield return (@"(.*?)a(?!(a+)b\2c)\2(.*)", "baaabaac", RegexOptions.None, 0, 8, false, string.Empty);
9595
yield return (@"(?!(abc))+\w\w\w", "abcdef", RegexOptions.None, 0, 6, true, "bcd");
96+
yield return (@"a+(?!c)", "aaab", RegexOptions.None, 0, 4, true, "aaa");
97+
yield return (@"a+(?!c)", "aaac", RegexOptions.None, 0, 4, true, "aa");
98+
yield return (@"a*(?!c)", "aaab", RegexOptions.None, 0, 4, true, "aaa");
99+
yield return (@"a{2,5}(?!c)", "aaaaac", RegexOptions.None, 0, 6, true, "aaaa");
100+
yield return (@"a+?(?!c)", "aaab", RegexOptions.None, 0, 4, true, "a");
101+
yield return (@"a*?(?!c)", "aaab", RegexOptions.None, 0, 4, true, "");
102+
yield return (@"a{2,5}?(?!c)", "aaaaab", RegexOptions.None, 0, 6, true, "aa");
103+
yield return (@"[ab]*(?!x)", "ababc", RegexOptions.None, 0, 5, true, "abab");
104+
yield return (@"a+(?=b)(?!c)", "aabx", RegexOptions.None, 0, 4, true, "aa");
105+
yield return (@"a+?(?=b)(?!c)", "aabx", RegexOptions.None, 0, 4, true, "aa");
106+
107+
// Zero-width positive lookahead assertion
96108
yield return (@"(?=(abc))?\1", "abc", RegexOptions.None, 0, 3, true, "abc");
97109
yield return (@"(?=(abc))+\1", "abc", RegexOptions.None, 0, 3, true, "abc");
98110
yield return (@"(?=(abc))*\1", "abc", RegexOptions.None, 0, 3, true, "abc");
111+
yield return (@"^.*?(?=.)b", "ab", RegexOptions.None, 0, 2, true, "ab");
112+
yield return (@".*?(?=.)b", "ab", RegexOptions.None, 0, 2, true, "ab");
113+
yield return (@"^(?>.*?)(?=.)b", "ab", RegexOptions.None, 0, 2, false, "");
114+
yield return (@"(?>.*?)(?=.)b", "ab", RegexOptions.None, 0, 2, true, "b");
115+
yield return (@"a+(?=b)", "aaab", RegexOptions.None, 0, 4, true, "aaa");
116+
yield return (@"a+(?=b)", "aaabc", RegexOptions.None, 0, 5, true, "aaa");
117+
yield return (@"a*(?=b)", "aaab", RegexOptions.None, 0, 4, true, "aaa");
118+
yield return (@"a{2,5}(?=b)", "aaaaab", RegexOptions.None, 0, 6, true, "aaaaa");
119+
yield return (@"a+?(?=b)", "aaab", RegexOptions.None, 0, 4, true, "aaa");
120+
yield return (@"a*?(?=b)", "aaab", RegexOptions.None, 0, 4, true, "aaa");
121+
yield return (@"a{2,5}?(?=b)", "aaaaab", RegexOptions.None, 0, 6, true, "aaaaa");
122+
yield return (@"a+b+(?=c)", "aabbbc", RegexOptions.None, 0, 6, true, "aabbb");
123+
yield return (@"a+?b+(?=c)", "aabbbc", RegexOptions.None, 0, 6, true, "aabbb");
124+
yield return (@"a+b+?(?=c)", "aabbbc", RegexOptions.None, 0, 6, true, "aabbb");
125+
yield return (@"[ab]+(?=c)", "ababc", RegexOptions.None, 0, 5, true, "abab");
126+
yield return (@"[ab]+?(?=c)", "ababc", RegexOptions.None, 0, 5, true, "abab");
127+
yield return (@"\w+(?=\b)", "hello world", RegexOptions.None, 0, 11, true, "hello");
128+
yield return (@"\w+?(?=\b)", "hello world", RegexOptions.None, 0, 11, true, "hello");
129+
yield return (@"(?>a+)(?=b)", "aaab", RegexOptions.None, 0, 4, true, "aaa");
130+
yield return (@"(?>a*)(?=b)", "aaab", RegexOptions.None, 0, 4, true, "aaa");
131+
yield return (@"(?>a{2,5})(?=b)", "aaaaab", RegexOptions.None, 0, 6, true, "aaaaa");
132+
yield return (@"a*(?=a)", "aaa", RegexOptions.None, 0, 3, true, "aa");
133+
yield return (@"a*?(?=a)", "aaa", RegexOptions.None, 0, 3, true, "");
134+
yield return (@"a+(?=a*b)ab", "aaaab", RegexOptions.None, 0, 5, true, "aaaab");
135+
yield return (@"a+?(?=a*b)ab", "aaaab", RegexOptions.None, 0, 5, true, "aaaab");
136+
yield return (@"(a+)+(?=b)", "aaab", RegexOptions.None, 0, 4, true, "aaa");
137+
yield return (@"(a+?)+(?=b)", "aaab", RegexOptions.None, 0, 4, true, "aaa");
138+
yield return (@"(a+)+?(?=b)", "aaab", RegexOptions.None, 0, 4, true, "aaa");
139+
yield return (@"(a+|b+)(?=c)", "aaac", RegexOptions.None, 0, 4, true, "aaa");
140+
yield return (@"(a+?|b+?)(?=c)", "aaac", RegexOptions.None, 0, 4, true, "aaa");
141+
yield return (@"(a+)(?=\1b)", "aaaaaab", RegexOptions.None, 0, 7, true, "aaa");
142+
yield return (@"(a+?)(?=\1b)", "aaaaaab", RegexOptions.None, 0, 7, true, "aaa");
143+
yield return (@"[A-Z]+(?=b)", "AAAb", RegexOptions.IgnoreCase, 0, 4, true, "AAA");
144+
yield return (@"[A-Z]+?(?=b)", "AAAb", RegexOptions.IgnoreCase, 0, 4, true, "AAA");
99145

100146
// Zero-width positive lookbehind assertion
101147
yield return (@"(\w){6}(?<=XXX)def", "abcXXXdef", RegexOptions.None, 0, 9, true, "abcXXXdef");
@@ -136,13 +182,21 @@ public static IEnumerable<object[]> Match_MemberData()
136182
yield return (@"(?<=(abc)+?)", "123abc", RegexOptions.None, 0, 6, true, "");
137183
yield return (@"(?<=(abc)+?)", "123ab", RegexOptions.None, 0, 5, false, "");
138184
yield return (@"(?<=(abc)+?123)", "abcabc123", RegexOptions.None, 0, 9, true, "");
185+
yield return (@"a+(?!c)(?<=y)", "yaab", RegexOptions.None, 0, 4, false, "");
186+
yield return (@"(?<=a{2,4})b+", "aaabbb", RegexOptions.None, 0, 6, true, "bbb");
187+
yield return (@"(?<=a+)b+?", "aaabbb", RegexOptions.None, 0, 6, true, "b");
139188

140189
// Zero-width negative lookbehind assertion: Actual - "(\\w){6}(?<!XXX)def"
141190
yield return (@"(\w){6}(?<!XXX)def", "XXXabcdef", RegexOptions.None, 0, 9, true, "XXXabcdef");
142191
yield return (@"123(?<!$) abcdef", "123 abcdef", RegexOptions.None, 0, 10, true, "123 abcdef");
143192
yield return (@"(abc)\w(?<!(?(1)e|d))", "abcdabc", RegexOptions.None, 0, 7, true, "abcd");
144193
yield return (@"(abc)\w(?<!(?(cd)e|d))", "abcdabc", RegexOptions.None, 0, 7, true, "abcd");
145194
yield return (@"(?<!(b)a)\1", "bb", RegexOptions.None, 0, 2, false, string.Empty); // negative assertion should not capture
195+
yield return (@"(?<=a)b+c", "abbbbc", RegexOptions.None, 0, 6, true, "bbbbc");
196+
yield return (@"(?<=a+)bc", "aaabc", RegexOptions.None, 0, 5, true, "bc");
197+
yield return (@"(?<!x)a+b", "yaab", RegexOptions.None, 0, 4, true, "aab");
198+
yield return (@"(?<!x)a+b", "xaab", RegexOptions.None, 0, 4, true, "ab");
199+
yield return (@"a+(?=b)(?<!x)", "yaab", RegexOptions.None, 0, 4, true, "aa");
146200

147201
// Nonbacktracking subexpression: Actual - "[^0-9]+(?>[0-9]+)3"
148202
// The last 3 causes the match to fail, since the non backtracking subexpression does not give up the last digit it matched

src/libraries/System.Text.RegularExpressions/tests/UnitTests/RegexReductionTests.cs

Lines changed: 5 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -288,6 +288,7 @@ public class RegexReductionTests
288288
[InlineData(@"abc(?=\Z)", @"abc\Z")]
289289
[InlineData(@"abc(?=\A)", @"abc\A")]
290290
[InlineData(@"abc(?=$)", @"abc$")]
291+
[InlineData(@"a*(?=b)bcd", @"(?>a*)(?=b)bcd")]
291292
// Alternation reduction
292293
[InlineData("a|b", "[ab]")]
293294
[InlineData("a|b|c|d|e|g|h|z", "[a-eghz]")]
@@ -574,6 +575,10 @@ public void PatternsReduceIdentically(string actual, string expected)
574575
[InlineData("(?=(abc))", "(?=abc)")]
575576
[InlineData("(?=a(b*)c)", "(?=ab*c)")]
576577
[InlineData("(?=a((((b))))c)", "(?=abc)")]
578+
[InlineData(@"a*(?=a)", @"(?>a*)(?=a)")]
579+
[InlineData(@"a*(?!b)b", @"(?>a*)(?!b)b")]
580+
[InlineData(@"a*(?<!b)cde", @"(?>a*)(?<!b)cde")]
581+
[InlineData(@"a*(?<=b)cde", @"(?>a*)(?<=b)cde")]
577582
// Loops inside alternation constructs
578583
[InlineData("(abc*|def)chi", "(ab(?>c*)|def)chi")]
579584
[InlineData("(abc|def*)fhi", "(abc|de(?>f*))fhi")]

0 commit comments

Comments
 (0)