Skip to content

Commit 7a68903

Browse files
authored
Remove positive lookarounds that wrap only zero-width assertions (#118091)
A positive lookahead effectively changes its contents to be zero-width. If the contents are already zero-width, the lookaround adds no value.
1 parent c74a167 commit 7a68903

File tree

4 files changed

+41
-13
lines changed

4 files changed

+41
-13
lines changed

src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexNode.cs

Lines changed: 29 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -2086,23 +2086,42 @@ static bool RemoveCaptures(RegexNode parent, int nodeIndex)
20862086
// eliminate any ending backtracking from it.
20872087
EliminateEndingBacktracking();
20882088

2089-
// A positive lookaround wrapped around an empty is a nop, and we can reduce it
2090-
// to simply Empty. A developer typically doesn't write this, but rather it evolves
2091-
// due to optimizations resulting in empty.
2092-
2093-
// A negative lookaround wrapped around an empty child, i.e. (?!), is
2094-
// sometimes used as a way to insert a guaranteed no-match into the expression,
2095-
// often as part of a conditional. We can reduce it to simply Nothing.
2089+
RegexNode child = Child(0);
20962090

2097-
if (Child(0).Kind == RegexNodeKind.Empty)
2091+
// A positive lookahead that wraps only a zero-width assertion is redundant and can be replaced by that assertion.
2092+
// Similarly, a positive lookaround wrapped around an empty child can be reduced to simply Empty.
2093+
// A developer typically doesn't write this; rather, it arises when other optimizations reduce the child to Empty.
2094+
if (Kind is RegexNodeKind.PositiveLookaround)
20982095
{
2099-
Kind = Kind == RegexNodeKind.PositiveLookaround ? RegexNodeKind.Empty : RegexNodeKind.Nothing;
2100-
Children = null;
2096+
if (((Options & RegexOptions.RightToLeft) == 0 && IsZeroWidthAssertion(child.Kind)) ||
2097+
child.Kind is RegexNodeKind.Empty)
2098+
{
2099+
return child;
2100+
}
2101+
}
2102+
else if (Kind is RegexNodeKind.NegativeLookaround)
2103+
{
2104+
// A negative lookaround wrapped around an empty child, i.e. (?!), is
2105+
// sometimes used as a way to insert a guaranteed no-match into the expression,
2106+
// often as part of a conditional. We can reduce it to simply Nothing.
2107+
if (child.Kind is RegexNodeKind.Empty)
2108+
{
2109+
Kind = RegexNodeKind.Nothing;
2110+
Children = null;
2111+
}
21012112
}
21022113

21032114
return this;
21042115
}
21052116

2117+
/// <summary>Gets whether the specified node kind is one of the zero-width assertion kinds listed in the pattern: a positive or negative lookaround, Beginning, Start, Bol, Eol, End, EndZ, or a (non-)boundary / ECMA (non-)boundary.</summary>
private static bool IsZeroWidthAssertion(RegexNodeKind kind) => kind is
2118+
RegexNodeKind.PositiveLookaround or RegexNodeKind.NegativeLookaround or
2119+
RegexNodeKind.Beginning or RegexNodeKind.Start or
2120+
RegexNodeKind.Bol or RegexNodeKind.Eol or
2121+
RegexNodeKind.End or RegexNodeKind.EndZ or
2122+
RegexNodeKind.Boundary or RegexNodeKind.ECMABoundary or
2123+
RegexNodeKind.NonBoundary or RegexNodeKind.NonECMABoundary;
2124+
21062125
/// <summary>Gets whether the node contains a backreference anywhere in its tree.</summary>
21072126
private static bool? ContainsBackreference(RegexNode node)
21082127
{

src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexPrefixAnalyzer.cs

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1354,8 +1354,6 @@ private static RegexNodeKind FindLeadingOrTrailingAnchor(RegexNode node, bool le
13541354
case RegexNodeKind.Start:
13551355
case RegexNodeKind.EndZ:
13561356
case RegexNodeKind.End:
1357-
case RegexNodeKind.Boundary:
1358-
case RegexNodeKind.ECMABoundary:
13591357
// Return any anchor found.
13601358
return node.Kind;
13611359

@@ -1389,6 +1387,7 @@ private static RegexNodeKind FindLeadingOrTrailingAnchor(RegexNode node, bool le
13891387
{
13901388
case RegexNodeKind.Empty or RegexNodeKind.NegativeLookaround:
13911389
case RegexNodeKind.PositiveLookaround when ((node.Options | tmpChild.Options) & RegexOptions.RightToLeft) != 0:
1390+
case RegexNodeKind.Boundary or RegexNodeKind.ECMABoundary or RegexNodeKind.NonBoundary or RegexNodeKind.NonECMABoundary:
13921391
// Skip over zero-width assertions.
13931392
continue;
13941393

src/libraries/System.Text.RegularExpressions/tests/UnitTests/RegexFindOptimizationsTests.cs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -34,7 +34,7 @@ public class RegexFindOptimizationsTests
3434
[InlineData(@"(?=^)abc", 0, (int)FindNextStartingPositionMode.LeadingAnchor_LeftToRight_Beginning)]
3535
[InlineData(@"(?=.*$)abc", 0, (int)FindNextStartingPositionMode.LeadingString_LeftToRight)]
3636
[InlineData(@"(?=^)abc", (int)RegexOptions.RightToLeft, (int)FindNextStartingPositionMode.LeadingString_RightToLeft)]
37-
[InlineData(@"abc(?=^)", (int)RegexOptions.RightToLeft, (int)FindNextStartingPositionMode.LeadingString_RightToLeft)]
37+
[InlineData(@"abc(?=^)", (int)RegexOptions.RightToLeft, (int)FindNextStartingPositionMode.LeadingAnchor_RightToLeft_Beginning)]
3838
[InlineData(@"(?<!42)(?=^)abc", 0, (int)FindNextStartingPositionMode.LeadingAnchor_LeftToRight_Beginning)]
3939
[InlineData(@"(?<=^)abc", 0, (int)FindNextStartingPositionMode.LeadingString_LeftToRight)]
4040
[InlineData(@"(?<=^)(?=^)abc", 0, (int)FindNextStartingPositionMode.LeadingAnchor_LeftToRight_Beginning)]

src/libraries/System.Text.RegularExpressions/tests/UnitTests/RegexReductionTests.cs

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -403,6 +403,16 @@ public class RegexReductionTests
403403
[InlineData(@"\z\z", @"\z")]
404404
[InlineData(@"\G\G", @"\G")]
405405
[InlineData(@"\A\A", @"\A")]
406+
// Lookarounds
407+
[InlineData(@"(?=^)abc", @"^abc")]
408+
[InlineData(@"(?=\G)abc", @"\Gabc")]
409+
[InlineData(@"abc(?=$)", @"abc$")]
410+
[InlineData(@"(?=\b)abc", @"\babc")]
411+
[InlineData(@"abc(?=\z)", @"abc\z")]
412+
[InlineData(@"abc(?=\Z)", @"abc\Z")]
413+
[InlineData(@"abc(?=\A)", @"abc\A")]
414+
[InlineData(@"(?=(?=(?=abc)))", @"(?=abc)")]
415+
[InlineData(@"(?=(?<=(?=abc)))", @"(?<=(?=abc))")]
406416
// Nothing handling
407417
[InlineData(@"\wabc(?!)def", "(?!)")]
408418
[InlineData(@"\wabc(?!)def|ghi(?!)", "(?!)")]

0 commit comments

Comments
 (0)