Skip to content

Commit a8c169a

Browse files
authored
Replace nop regex loops with empty (#118079)
When a loop's body ends up being a zero-width assertion and the loop has a minimum bound of 0, the whole loop can be removed: the assertion consumes no input and is optional, so the loop succeeds whether or not the assertion holds.
1 parent 197f38a commit a8c169a

File tree

3 files changed

+34
-4
lines changed

3 files changed

+34
-4
lines changed

src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexNode.cs

Lines changed: 14 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -894,7 +894,9 @@ private RegexNode ReduceLoops()
894894
// If the Loop or Lazyloop now only has one child node and it's a Set, One, or Notone,
895895
// reduce to just Setloop/lazy, Oneloop/lazy, or Notoneloop/lazy. The parser will
896896
// generally have only produced the latter, but other reductions could have exposed
897-
// this.
897+
// this. We can also reduce or eliminate certain loops that are nops, e.g.
898+
// a loop with a minimum of 0 that wraps a zero-width assertion is either asserting something
899+
// or not, and is thus useless.
898900
if (u.ChildCount() == 1)
899901
{
900902
RegexNode child = u.Child(0);
@@ -906,6 +908,17 @@ private RegexNode ReduceLoops()
906908
child.MakeRep(u.Kind == RegexNodeKind.Lazyloop ? RegexNodeKind.Onelazy : RegexNodeKind.Oneloop, u.M, u.N);
907909
u = child;
908910
break;
911+
912+
case RegexNodeKind.Empty:
913+
case RegexNodeKind.PositiveLookaround or RegexNodeKind.NegativeLookaround or
914+
RegexNodeKind.Beginning or RegexNodeKind.Start or
915+
RegexNodeKind.Bol or RegexNodeKind.Eol or
916+
RegexNodeKind.End or RegexNodeKind.EndZ or
917+
RegexNodeKind.Boundary or RegexNodeKind.ECMABoundary or
918+
RegexNodeKind.NonBoundary or RegexNodeKind.NonECMABoundary
919+
when u.M == 0:
920+
u = new RegexNode(RegexNodeKind.Empty, Options);
921+
break;
909922
}
910923
}
911924

src/libraries/System.Text.RegularExpressions/tests/UnitTests/RegexFindOptimizationsTests.cs

Lines changed: 2 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -43,9 +43,8 @@ public class RegexFindOptimizationsTests
4343
[InlineData(@"(?=\b)^abc", 0, (int)FindNextStartingPositionMode.LeadingAnchor_LeftToRight_Beginning)]
4444
[InlineData(@"(?=\b)(?=^.*$)abc", 0, (int)FindNextStartingPositionMode.LeadingAnchor_LeftToRight_Beginning)]
4545
[InlineData(@"(?=\b)(?=\B)^abc", 0, (int)FindNextStartingPositionMode.LeadingAnchor_LeftToRight_Beginning)]
46-
// The next two could be improved slightly to be LeadingString_LeftToRight.
47-
[InlineData(@"(?=^.*def)?abc", 0, (int)FindNextStartingPositionMode.FixedDistanceChar_LeftToRight)]
48-
[InlineData(@"(?=^)?(?=^)abc", 0, (int)FindNextStartingPositionMode.FixedDistanceChar_LeftToRight)]
46+
[InlineData(@"(?=^.*def)abc", 0, (int)FindNextStartingPositionMode.LeadingAnchor_LeftToRight_Beginning)]
47+
[InlineData(@"(?=^)(?=^)abc", 0, (int)FindNextStartingPositionMode.LeadingAnchor_LeftToRight_Beginning)]
4948

5049
[InlineData(@"^", (int)RegexOptions.RightToLeft, (int)FindNextStartingPositionMode.LeadingAnchor_RightToLeft_Beginning)]
5150
[InlineData(@"hello^", (int)RegexOptions.RightToLeft, (int)FindNextStartingPositionMode.LeadingAnchor_RightToLeft_Beginning)]

src/libraries/System.Text.RegularExpressions/tests/UnitTests/RegexReductionTests.cs

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -244,6 +244,19 @@ public class RegexReductionTests
244244
// Large loop patterns
245245
[InlineData("a*a*a*a*a*a*a*b*b*?a+a*", "a*b*b*?a+")]
246246
[InlineData("a?a?a?a?a?a?a?a?a?a?a?a?a?a?a?a?a?a?a?a?a?a?a?a?a?a?a?a?a?a?aaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", "a{0,30}aaaaaaaaaaaaaaaaaaaaaaaaaaaaaa")]
247+
// Nop loops
248+
[InlineData("(?:)*", "")]
249+
[InlineData("a(?=abc)*b", "ab")]
250+
[InlineData("a(?<=abc)*b", "ab")]
251+
[InlineData("a(?<!abc)*b", "ab")]
252+
[InlineData("a$*b", "ab")]
253+
[InlineData("a^?b", "ab")]
254+
[InlineData(@"a\b*b", "ab")]
255+
[InlineData(@"a\B*b", "ab")]
256+
[InlineData(@"a\z?b", "ab")]
257+
[InlineData(@"a\Z?b", "ab")]
258+
[InlineData(@"a\A?b", "ab")]
259+
[InlineData(@"a\G?b", "ab")]
247260
// Group elimination
248261
[InlineData("(?:(?:(?:(?:(?:(?:a*))))))", "a*")]
249262
// Nested loops
@@ -542,6 +555,11 @@ public void PatternsReduceIdentically(string actual, string expected)
542555
[InlineData("a*(?(xyz)bcd)", "(?>a*)(?(xyz)bcd)")]
543556
// Different prefixes on alternation branches
544557
[InlineData("^abcd|$abce", "^abcd|^abce")]
558+
// Zero-width assertions in non-removable loops
559+
[InlineData("a(?=abc)+b", "ab")]
560+
[InlineData("a(?<=abc)+b", "ab")]
561+
[InlineData("a(?<!abc){1,2}b", "ab")]
562+
[InlineData("a${3,}b", "ab")]
545563
// Anchors
546564
[InlineData(@"\b\B", "\b")]
547565
[InlineData(@"^$", "^")]

0 commit comments

Comments
 (0)