Skip to content

Commit 27cc30f

Browse files
authored
Transform regex X| into X? and |X into X?? (#118087)
An alternation with two branches where the second branch is empty is equivalent to wrapping the first branch in a greedy optional loop (`X|` becomes `X?`); similarly, when the first branch is empty, it is equivalent to a lazy optional loop around the second branch (`|X` becomes `X??`). The optional form is optimized better by other regex transforms, e.g. loop coalescing, so we're better off with the optional representation.
1 parent 865d4e6 commit 27cc30f

File tree

2 files changed

+27
-0
lines changed

2 files changed

+27
-0
lines changed

src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexNode.cs

Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -790,6 +790,11 @@ private RegexNode ReduceAtomic()
790790
start = endExclusive;
791791
}
792792

793+
// Force a re-reduction if we know we've exposed new opportunities that'll be handled.
794+
reordered |=
795+
child.ChildCount() == 2 &&
796+
(child.Child(0).Kind is RegexNodeKind.Empty || child.Child(1).Kind is RegexNodeKind.Empty); // can be transformed into a ? or ??
797+
793798
// If anything was reordered, there may be new optimization opportunities inside
794799
// of the alternation, so reduce it again.
795800
if (reordered)
@@ -1032,6 +1037,22 @@ private RegexNode ReduceAlternation()
10321037
if (node.Kind == RegexNodeKind.Alternate)
10331038
{
10341039
node = RemoveRedundantEmptiesAndNothings(node);
1040+
1041+
// If the alternation is actually just a ? or ?? in disguise, transform it accordingly.
1042+
// (a|) becomes a?
1043+
// (|a) becomes a??
1044+
// Such "optional" nodes are processed more efficiently, including being able to be better coalesced with surrounding nodes.
1045+
if (node.Kind is RegexNodeKind.Alternate && node.ChildCount() == 2)
1046+
{
1047+
if (node.Child(1).Kind is RegexNodeKind.Empty)
1048+
{
1049+
node = node.Child(0).MakeQuantifier(lazy: false, min: 0, max: 1);
1050+
}
1051+
else if (node.Child(0).Kind is RegexNodeKind.Empty)
1052+
{
1053+
node = node.Child(1).MakeQuantifier(lazy: true, min: 0, max: 1);
1054+
}
1055+
}
10351056
}
10361057
}
10371058
}

src/libraries/System.Text.RegularExpressions/tests/UnitTests/RegexReductionTests.cs

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -306,6 +306,12 @@ public class RegexReductionTests
306306
[InlineData("abcd|aefg", "a(?>bcd|efg)")]
307307
[InlineData("abcd|abc|ab|a", "a(?>bcd|bc|b|)")]
308308
[InlineData("^abcd|^abce", "^(?:abc[de])")]
309+
[InlineData("abc|", "(?:abc)?")]
310+
[InlineData("a|", "a?")]
311+
[InlineData("(?:abc|)d", "(?>(?:abc)?)d")]
312+
[InlineData("(?:a|)a", "a{1,2}")]
313+
[InlineData("(?:a|)a*", "a*")]
314+
[InlineData("a+(?:a|)", "a+")]
309315
// [InlineData("abcde|abcdef", "abcde(?>|f)")] // TODO https://github.com/dotnet/runtime/issues/66031: Need to reorganize optimizations to avoid an extra Empty being left at the end of the tree
310316
[InlineData("abcdef|abcde", "abcde(?>f|)")]
311317
[InlineData("abcdef|abcdeg|abcdeh|abcdei|abcdej|abcdek|abcdel", "abcde[f-l]")]

0 commit comments

Comments
 (0)