Skip to content

Commit 028b30f

Browse files
authored
Further reduce loops around zero-width assertions (#118111)
* Further reduce loops around zero-width assertions. In addition to replacing the loop with an empty node when it contains only a zero-width assertion and has a lower bound of 0, if the loop has a lower bound greater than 0, we can just remove the loop layer, replacing it with its child directly, since any iterations beyond the first are redundant.
1 parent 96b8cdc commit 028b30f

File tree

3 files changed

+47
-44
lines changed

3 files changed

+47
-44
lines changed

src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexNode.cs

Lines changed: 35 additions & 34 deletions
Original file line numberDiff line numberDiff line change
@@ -893,10 +893,7 @@ private RegexNode ReduceLoops()
893893

894894
// If the Loop or Lazyloop now only has one child node and it's a Set, One, or Notone,
895895
// reduce to just Setloop/lazy, Oneloop/lazy, or Notoneloop/lazy. The parser will
896-
// generally have only produced the latter, but other reductions could have exposed
897-
// this. We can also reduce or eliminate certain loops that are nops, e.g.
898-
// a loop with a minimum of 0 that wraps a zero-width assertion is either asserting something
899-
// or not, and is thus useless.
896+
// generally have only produced the latter, but other reductions could have exposed this.
900897
if (u.ChildCount() == 1)
901898
{
902899
RegexNode child = u.Child(0);
@@ -910,14 +907,27 @@ private RegexNode ReduceLoops()
910907
break;
911908

912909
case RegexNodeKind.Empty:
913-
case RegexNodeKind.PositiveLookaround or RegexNodeKind.NegativeLookaround or
910+
// A loop around an empty is itself empty, regardless of iteration counts.
911+
u = child;
912+
break;
913+
914+
case RegexNodeKind.PositiveLookaround when ContainsKind(child, [RegexNodeKind.Capture]) is false:
915+
case RegexNodeKind.NegativeLookaround or
914916
RegexNodeKind.Beginning or RegexNodeKind.Start or
915917
RegexNodeKind.Bol or RegexNodeKind.Eol or
916918
RegexNodeKind.End or RegexNodeKind.EndZ or
917919
RegexNodeKind.Boundary or RegexNodeKind.ECMABoundary or
918-
RegexNodeKind.NonBoundary or RegexNodeKind.NonECMABoundary
919-
when u.M == 0:
920-
u = new RegexNode(RegexNodeKind.Empty, Options);
920+
RegexNodeKind.NonBoundary or RegexNodeKind.NonECMABoundary:
921+
// A loop around (most) zero-width assertions can also be reduced. If it has a lower bound of 0,
922+
// then it's either asserting something or not, and is thus useless and replaceable by empty.
923+
// If it has a lower bound > 0, then the contents are still needed, but the loop isn't, since
924+
// it's non-consuming and thus any more repetitions than 1 are redundant. The one zero-width assertion
925+
// that can't be handled in this way is a PositiveLookaround, because it might contain capture groups
926+
// with captures that must persist past the lookaround (in contrast, negative lookarounds undo all
927+
// captures); if it were to be removed, it could affect both subsequent backreferences as well as access
928+
// to capture information in the resulting Match. Thus, we can only transform a PositiveLookaround in
929+
// this manner if it doesn't contain any captures.
930+
u = u.M == 0 ? new RegexNode(RegexNodeKind.Empty, Options) : child;
921931
break;
922932
}
923933
}
@@ -2067,7 +2077,7 @@ private RegexNode ReduceLookaround()
20672077
// Captures inside of negative lookarounds are undone after the lookaround. Thus, if there's nothing
20682078
// inside of the negative lookaround that needs that capture group (namely a backreference), we can
20692079
// remove the capture.
2070-
if (Kind is RegexNodeKind.NegativeLookaround && ContainsBackreference(Child(0)) is false)
2080+
if (Kind is RegexNodeKind.NegativeLookaround && ContainsKind(Child(0), [RegexNodeKind.Backreference, RegexNodeKind.BackreferenceConditional]) is false)
20712081
{
20722082
if (RemoveCaptures(this, 0))
20732083
{
@@ -2140,26 +2150,32 @@ RegexNodeKind.Beginning or RegexNodeKind.Start or
21402150
RegexNodeKind.Bol or RegexNodeKind.Eol or
21412151
RegexNodeKind.End or RegexNodeKind.EndZ or
21422152
RegexNodeKind.Boundary or RegexNodeKind.ECMABoundary or
2143-
RegexNodeKind.NonBoundary or RegexNodeKind.NonECMABoundary;
2153+
RegexNodeKind.NonBoundary or RegexNodeKind.NonECMABoundary or
2154+
RegexNodeKind.UpdateBumpalong;
21442155

2145-
/// <summary>Gets whether the node contains a backreference anywhere in its tree.</summary>
2146-
private static bool? ContainsBackreference(RegexNode node)
2156+
/// <summary>Gets whether the node contains any of the specified kinds anywhere in its tree.</summary>
2157+
/// <returns><see langword="true"/> if it does, <see langword="false"/> if it doesn't, and <see langword="null"/> if it can't be determined.</returns>
2158+
private static bool? ContainsKind(RegexNode node, ReadOnlySpan<RegexNodeKind> kinds)
21472159
{
2148-
if (node.Kind is RegexNodeKind.Backreference or RegexNodeKind.BackreferenceConditional)
2160+
foreach (RegexNodeKind kind in kinds)
21492161
{
2150-
return true;
2162+
if (node.Kind == kind)
2163+
{
2164+
return true;
2165+
}
21512166
}
21522167

21532168
if (!StackHelper.TryEnsureSufficientExecutionStack())
21542169
{
2155-
// If we can't recur further, just stop optimizing.
2170+
// If we can't recur further, just stop optimizing. We need to return null to signal
2171+
// that the result can't be trusted.
21562172
return null;
21572173
}
21582174

21592175
int childCount = node.ChildCount();
21602176
for (int i = 0; i < childCount; i++)
21612177
{
2162-
if (ContainsBackreference(node.Child(i)) is true)
2178+
if (ContainsKind(node.Child(i), kinds) is true)
21632179
{
21642180
return true;
21652181
}
@@ -2796,25 +2812,10 @@ public bool TryGetOrdinalCaseInsensitiveString(int childIndex, int exclusiveChil
27962812
// Skip over empty nodes, as they're pure nops. They would ideally have been optimized away,
27972813
// but can still remain in some situations.
27982814
}
2799-
else if (consumeZeroWidthNodes &&
2800-
// anchors
2801-
child.Kind is RegexNodeKind.Beginning or
2802-
RegexNodeKind.Bol or
2803-
RegexNodeKind.Start or
2804-
// boundaries
2805-
RegexNodeKind.Boundary or
2806-
RegexNodeKind.ECMABoundary or
2807-
RegexNodeKind.NonBoundary or
2808-
RegexNodeKind.NonECMABoundary or
2809-
// lookarounds
2810-
RegexNodeKind.NegativeLookaround or
2811-
RegexNodeKind.PositiveLookaround or
2812-
// logic
2813-
RegexNodeKind.UpdateBumpalong)
2815+
else if (consumeZeroWidthNodes && IsZeroWidthAssertion(child.Kind))
28142816
{
2815-
// Skip over zero-width nodes that might be reasonable at the beginning of or within a substring.
2816-
// We can only do these if consumeZeroWidthNodes is true, as otherwise we'd be producing a string that
2817-
// may not fully represent the semantics of this portion of the pattern.
2817+
// Skip over zero-width nodes. We can only do these if consumeZeroWidthNodes is true, as otherwise we'd
2818+
// be producing a string that may not fully represent the semantics of this portion of the pattern.
28182819
}
28192820
else
28202821
{

src/libraries/System.Text.RegularExpressions/tests/FunctionalTests/Regex.Match.Tests.cs

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -93,6 +93,9 @@ public static IEnumerable<object[]> Match_MemberData()
9393
yield return (@"(?:(?!(b)b)\1a)*", "babababa", RegexOptions.None, 0, 8, true, string.Empty);
9494
yield return (@"(.*?)a(?!(a+)b\2c)\2(.*)", "baaabaac", RegexOptions.None, 0, 8, false, string.Empty);
9595
yield return (@"(?!(abc))+\w\w\w", "abcdef", RegexOptions.None, 0, 6, true, "bcd");
96+
yield return (@"(?=(abc))?\1", "abc", RegexOptions.None, 0, 3, true, "abc");
97+
yield return (@"(?=(abc))+\1", "abc", RegexOptions.None, 0, 3, true, "abc");
98+
yield return (@"(?=(abc))*\1", "abc", RegexOptions.None, 0, 3, true, "abc");
9699

97100
// Zero-width positive lookbehind assertion
98101
yield return (@"(\w){6}(?<=XXX)def", "abcXXXdef", RegexOptions.None, 0, 9, true, "abcXXXdef");

src/libraries/System.Text.RegularExpressions/tests/UnitTests/RegexReductionTests.cs

Lines changed: 9 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -279,6 +279,15 @@ public class RegexReductionTests
279279
[InlineData("(?!(abc))", "(?!abc)")]
280280
[InlineData("(?!a(b*)c)", "(?!ab*c)")]
281281
[InlineData("(?!a((((b))))c)", "(?!abc)")]
282+
[InlineData(@"(?=(?=(?=abc)))", @"(?=abc)")]
283+
[InlineData(@"(?=(?<=(?=abc)))", @"(?<=(?=abc))")]
284+
[InlineData(@"(?=\G)abc", @"\Gabc")]
285+
[InlineData(@"(?=^)abc", @"^abc")]
286+
[InlineData(@"(?=\b)abc", @"\babc")]
287+
[InlineData(@"abc(?=\z)", @"abc\z")]
288+
[InlineData(@"abc(?=\Z)", @"abc\Z")]
289+
[InlineData(@"abc(?=\A)", @"abc\A")]
290+
[InlineData(@"abc(?=$)", @"abc$")]
282291
// Alternation reduction
283292
[InlineData("a|b", "[ab]")]
284293
[InlineData("a|b|c|d|e|g|h|z", "[a-eghz]")]
@@ -413,16 +422,6 @@ public class RegexReductionTests
413422
[InlineData(@"\z\z", @"\z")]
414423
[InlineData(@"\G\G", @"\G")]
415424
[InlineData(@"\A\A", @"\A")]
416-
// Lookarounds
417-
[InlineData(@"(?=^)abc", @"^abc")]
418-
[InlineData(@"(?=\G)abc", @"\Gabc")]
419-
[InlineData(@"abc(?=$)", @"abc$")]
420-
[InlineData(@"(?=\b)abc", @"\babc")]
421-
[InlineData(@"abc(?=\z)", @"abc\z")]
422-
[InlineData(@"abc(?=\Z)", @"abc\Z")]
423-
[InlineData(@"abc(?=\A)", @"abc\A")]
424-
[InlineData(@"(?=(?=(?=abc)))", @"(?=abc)")]
425-
[InlineData(@"(?=(?<=(?=abc)))", @"(?<=(?=abc))")]
426425
// Nothing handling
427426
[InlineData(@"\wabc(?!)def", "(?!)")]
428427
[InlineData(@"\wabc(?!)def|ghi(?!)", "(?!)")]

0 commit comments

Comments
 (0)