Skip to content

Commit 2b52fb2

Browse files
authored
Tweak positive lookahead handling in CanBeMadeAtomic (#118221)
* Tweak positive lookahead handling in CanBeMadeAtomic I was overthinking things before, and the handling for positive lookaheads can be simplified and expanded. The logic was saying that a loop followed by an positive lookahead could be made atomic if it was disjoint not only from the lookahead's contents but also from what followed the lookahead. However, the latter isn't actually necessary. If the loop is disjoint with the lookahead, then it's also disjoint with the intersection of the lookahead and what follows / overlaps with the lookahead. * Add a few more tests
1 parent 61126df commit 2b52fb2

File tree

2 files changed

+54
-7
lines changed

2 files changed

+54
-7
lines changed

src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexNode.cs

Lines changed: 13 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -2287,10 +2287,23 @@ private static bool CanBeMadeAtomic(RegexNode node, RegexNode subsequent, bool i
22872287
{
22882288
switch (subsequent.Kind)
22892289
{
2290+
// Concatenate, capture, and atomic do not impact what comes at the beginning of their children,
2291+
// so we can skip down to the first child.
22902292
case RegexNodeKind.Concatenate:
22912293
case RegexNodeKind.Capture:
22922294
case RegexNodeKind.Atomic:
2295+
2296+
// Similarly, as long as a loop is guaranteed to iterate at least once, we can skip down to the child,
2297+
// as whatever starts it is guaranteed to come after the predecessor.
22932298
case RegexNodeKind.Loop or RegexNodeKind.Lazyloop when subsequent.M > 0:
2299+
2300+
// Positive lookaheads can also be skipped through. The lookahead logically comes after the predecessor,
2301+
// and even though it's zero width, we don't need to look at whatever comes after the lookahead, because
2302+
// the lookahead ends up overlapping with its successor. If the node is disjoint from the lookahead, then
2303+
// it's also disjoint from the intersection of the lookahead and the lookahead's successor, since the
2304+
// intersection can only narrow the possible set of characters that need to be considered for overlap with
2305+
// the predecessor node.
2306+
case RegexNodeKind.PositiveLookaround when (subsequent.Options & RegexOptions.RightToLeft) == 0:
22942307
subsequent = subsequent.Child(0);
22952308
continue;
22962309
}
@@ -2330,13 +2343,6 @@ private static bool CanBeMadeAtomic(RegexNode node, RegexNode subsequent, bool i
23302343
// If it doesn't, then we can upgrade it to being atomic to avoid unnecessary backtracking.
23312344
switch (node.Kind)
23322345
{
2333-
case RegexNodeKind when iterateNullableSubsequent && subsequent.Kind is RegexNodeKind.PositiveLookaround:
2334-
if (!CanBeMadeAtomic(node, subsequent.Child(0), iterateNullableSubsequent: false, allowLazy: allowLazy))
2335-
{
2336-
return false;
2337-
}
2338-
break;
2339-
23402346
case RegexNodeKind.Oneloop:
23412347
case RegexNodeKind.Onelazy when allowLazy:
23422348
switch (subsequent.Kind)

src/libraries/System.Text.RegularExpressions/tests/UnitTests/RegexReductionTests.cs

Lines changed: 41 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -289,6 +289,18 @@ public class RegexReductionTests
289289
[InlineData(@"abc(?=\A)", @"abc\A")]
290290
[InlineData(@"abc(?=$)", @"abc$")]
291291
[InlineData(@"a*(?=b)bcd", @"(?>a*)(?=b)bcd")]
292+
[InlineData(@"a*(?=b)a", @"(?>a*)(?=b)a")]
293+
[InlineData(@"a+(?=\b)a", @"(?>a+)(?=\b)a")]
294+
[InlineData(@"(?=(?!abc))", @"(?!abc)")]
295+
[InlineData(@"(?=(?:abc))", @"(?=abc)")]
296+
[InlineData(@"(?!((?:a)b(?:c)))", @"(?!abc)")]
297+
[InlineData(@"(?=abc|abd)", @"(?=ab[cd])")]
298+
[InlineData(@"(?!abc|abd)", @"(?!ab[cd])")]
299+
[InlineData(@"(?<!(abc))", @"(?<!abc)")]
300+
[InlineData(@".*\n", @"(?>.*)\n")]
301+
[InlineData(@".*\n+", @"(?>.*)(?>\n+)")]
302+
[InlineData(@"(?((?=a))b)", @"(?((?=a))b|)")]
303+
[InlineData(@"(?((?!a))b)", @"(?((?!a))b|)")]
292304
// Alternation reduction
293305
[InlineData("a|b", "[ab]")]
294306
[InlineData("a|b|c|d|e|g|h|z", "[a-eghz]")]
@@ -313,6 +325,15 @@ public class RegexReductionTests
313325
[InlineData("(?:a|)a", "a{1,2}")]
314326
[InlineData("(?:a|)a*", "a*")]
315327
[InlineData("a+(?:a|)", "a+")]
328+
[InlineData(@"ab*(?=c)", @"a(?>b*)(?=c)")]
329+
[InlineData(@"\d+(?=\D)", @"(?>\d+)(?=\D)")]
330+
[InlineData(@"\D+(?=\d)", @"(?>\D+)(?=\d)")]
331+
[InlineData(@"\s+(?=\S)", @"(?>\s+)(?=\S)")]
332+
[InlineData(@"\S+(?=\s)", @"(?>\S+)(?=\s)")]
333+
[InlineData(@"[^\n]+(?=\n)", @"(?>[^\n]+)(?=\n)")]
334+
[InlineData(@"[a-f]+(?=[^a-f])", @"(?>[a-f]+)(?=[^a-f])")]
335+
[InlineData(@"[0-9]*(?=[^0-9])", @"(?>[0-9]*)(?=[^0-9])")]
336+
[InlineData(@"a*(?=b)(?=bc)", @"(?>a*)(?=b)(?=bc)")]
316337
// [InlineData("abcde|abcdef", "abcde(?>|f)")] // TODO https://github.com/dotnet/runtime/issues/66031: Need to reorganize optimizations to avoid an extra Empty being left at the end of the tree
317338
[InlineData("abcdef|abcde", "abcde(?>f|)")]
318339
[InlineData("abcdef|abcdeg|abcdeh|abcdei|abcdej|abcdek|abcdel", "abcde[f-l]")]
@@ -575,6 +596,17 @@ public void PatternsReduceIdentically(string actual, string expected)
575596
[InlineData("(abc?)*?d", "(?>(ab(?>c?))*)d")]
576597
[InlineData("(aba)+d", "(?>(aba)+)d")]
577598
[InlineData("(abc*)*d", "(?>(ab(?>c*))*)d")]
599+
[InlineData(@"a*?(?=b)", @"(?>a*?)(?=b)")]
600+
[InlineData(@"a+?(?=b)", @"(?>a+?)(?=b)")]
601+
[InlineData(@"a{1,3}?(?=b)", @"(?>a{1,3}?)(?=b)")]
602+
[InlineData(@"[ab]*(?=a)", @"(?>[ab]*)(?=a)")]
603+
[InlineData(@"\w*(?=\b)", @"(?>\w*)(?=\b)")]
604+
[InlineData(@".*(?=\b)", @"(?>.*)(?=\b)")]
605+
[InlineData(@".*(?=^)", @"(?>.*)(?=^)")]
606+
[InlineData(@"a*(?<=a)", @"(?>a*)(?<=a)")]
607+
[InlineData(@"a+(?<=a)", @"(?>a+)(?<=a)")]
608+
[InlineData(@"\d*(?<=\d)", @"(?>\d*)(?<=\d)")]
609+
[InlineData(@"[ab]*?(?!a)", @"(?>[ab]*?)(?!a)")]
578610
// Lookaround reduction
579611
[InlineData("(?=(abc))", "(?=abc)")]
580612
[InlineData("(?=a(b*)c)", "(?=ab*c)")]
@@ -583,6 +615,15 @@ public void PatternsReduceIdentically(string actual, string expected)
583615
[InlineData(@"a*(?!b)b", @"(?>a*)(?!b)b")]
584616
[InlineData(@"a*(?<!b)cde", @"(?>a*)(?<!b)cde")]
585617
[InlineData(@"a*(?<=b)cde", @"(?>a*)(?<=b)cde")]
618+
[InlineData(@"a*(?=)a", @"(?>a*)(?=)a")]
619+
[InlineData(@"(?<=(ab))", @"(?<=ab)")]
620+
[InlineData(@"(?=(ab|ac))", @"(?=a[bc])")]
621+
[InlineData(@"(?<=(ab|ac))", @"(?<=a[bc])")]
622+
[InlineData(@"(?=ab)|ac", @"a(?=b|c)")]
623+
[InlineData(@"(?=ab)c", @"a(?=bc)")]
624+
[InlineData(@"(?!ab)c", @"a(?!bc)")]
625+
[InlineData(@"(?=a)(?=b)", @"(?=ab)")]
626+
[InlineData(@"(?!ab)(?!ac)", @"(?!a[bc])")]
586627
// Loops inside alternation constructs
587628
[InlineData("(abc*|def)chi", "(ab(?>c*)|def)chi")]
588629
[InlineData("(abc|def*)fhi", "(abc|de(?>f*))fhi")]

0 commit comments

Comments
 (0)