Skip to content

Commit 197f38a

Browse files
authored
Coalesce adjacent equivalent anchors (#118083)
Patterns sometimes repeat the same anchor several times in a row, e.g. `\b\b`. Because an anchor is zero-width, every repetition after the first is a no-op and can simply be removed.
1 parent 7f96aef commit 197f38a

File tree

2 files changed

+24
-0
lines changed

2 files changed

+24
-0
lines changed

src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexNode.cs

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1832,6 +1832,16 @@ static bool CanCombineCounts(int nodeMin, int nodeMax, int nextMin, int nextMax)
18321832
currentNode.MakeRep(RegexNodeKind.Oneloop, 2, 2);
18331833
next++;
18341834
continue;
1835+
1836+
// Coalescing identical anchors (e.g. \b\b). These don't need to become loops, as they collapse to a single anchor.
1837+
case RegexNodeKind.Beginning or RegexNodeKind.Start or
1838+
RegexNodeKind.End or RegexNodeKind.EndZ or
1839+
RegexNodeKind.Bol or RegexNodeKind.Eol or
1840+
RegexNodeKind.Boundary or RegexNodeKind.NonBoundary or
1841+
RegexNodeKind.ECMABoundary or RegexNodeKind.NonECMABoundary
1842+
when nextNode.Kind == currentNode.Kind:
1843+
next++;
1844+
continue;
18351845
}
18361846
}
18371847

src/libraries/System.Text.RegularExpressions/tests/UnitTests/RegexReductionTests.cs

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -376,6 +376,16 @@ public class RegexReductionTests
376376
[InlineData("(abc?)*?d", "(ab(?>c?))*?d")]
377377
[InlineData("(ab*c)*d", "(?>(a(?>b*)c)*)d")]
378378
[InlineData("(aba)?d", "(?>(aba)?)d")]
379+
// Anchors
380+
[InlineData(@"\b\b", @"\b")]
381+
[InlineData(@"\b\b\b\b\b", @"\b")]
382+
[InlineData(@"\B\B", @"\B")]
383+
[InlineData(@"^^", @"^")]
384+
[InlineData(@"$$", @"$")]
385+
[InlineData(@"\Z\Z", @"\Z")]
386+
[InlineData(@"\z\z", @"\z")]
387+
[InlineData(@"\G\G", @"\G")]
388+
[InlineData(@"\A\A", @"\A")]
379389
// Nothing handling
380390
[InlineData(@"\wabc(?!)def", "(?!)")]
381391
[InlineData(@"\wabc(?!)def|ghi(?!)", "(?!)")]
@@ -532,6 +542,10 @@ public void PatternsReduceIdentically(string actual, string expected)
532542
[InlineData("a*(?(xyz)bcd)", "(?>a*)(?(xyz)bcd)")]
533543
// Different prefixes on alternation branches
534544
[InlineData("^abcd|$abce", "^abcd|^abce")]
545+
// Anchors
546+
[InlineData(@"\b\B", @"\b")]
547+
[InlineData(@"^$", "^")]
548+
[InlineData(@"^$", "$")]
535549
public void PatternsReduceDifferently(string actual, string expected)
536550
{
537551
// NOTE: RegexNode.ToString is only compiled into debug builds, so DEBUG is currently set on the unit tests project.

0 commit comments

Comments
 (0)