Skip to content

Commit a166765

Browse files
authored
Support Notones in alternation reductions that merge sets (#118109)
Today, an alternation like `a|b|[cd]|efg` is reduced to `[abcd]|efg`, but that reduction only handles One and Set nodes, not Notone nodes. As a result, the semi-common idiom `.|\n`, which people use to match any character when RegexOptions.Singleline is not set (in that mode `.` is a Notone node for `\n`), isn't reduced and remains an alternation. This change extends the existing reduction pass to also recognize Notone nodes by treating each one as one or two character ranges covering everything except the excluded character.
1 parent 4e8a560 commit a166765

File tree

3 files changed

+32
-2
lines changed

3 files changed

+32
-2
lines changed

src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexCharClass.cs

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -366,6 +366,23 @@ public bool Negate
366366

367367
public void AddChar(char c) => AddRange(c, c);
368368

369+
/// <summary>
/// Adds every character except <paramref name="c"/> to the class, expressed as one or two ranges.
/// </summary>
/// <param name="c">The single character to leave out of the added range(s).</param>
public void AddNotChar(char c)
370+
{
371+
// c is the lowest char value: nothing lies below it, so a single range [1, LastChar] suffices
// (and computing c - 1 is avoided).
if (c == 0)
372+
{
373+
AddRange((char)1, LastChar);
374+
}
375+
// c is the upper bound LastChar: nothing lies above it, so a single range [0, LastChar - 1] suffices
// (and computing c + 1 is avoided).
else if (c == LastChar)
376+
{
377+
AddRange((char)0, (char)(LastChar - 1));
378+
}
379+
// General case: c is strictly between the bounds, so add the range below it and the range above it.
else
380+
{
381+
AddRange((char)0, (char)(c - 1));
382+
AddRange((char)(c + 1), LastChar);
383+
}
384+
}
385+
369386
/// <summary>
370387
/// Adds a regex char class
371388
/// </summary>

src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexNode.cs

Lines changed: 11 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1072,7 +1072,7 @@ void ReduceSingleLetterAndNestedAlternations()
10721072
}
10731073
j--;
10741074
}
1075-
else if (at.Kind is RegexNodeKind.Set or RegexNodeKind.One)
1075+
else if (at.Kind is RegexNodeKind.Set or RegexNodeKind.One or RegexNodeKind.Notone)
10761076
{
10771077
// Cannot merge sets if L or I options differ, or if either are negated.
10781078
optionsAt = at.Options & (RegexOptions.RightToLeft | RegexOptions.IgnoreCase);
@@ -1095,7 +1095,7 @@ void ReduceSingleLetterAndNestedAlternations()
10951095
break;
10961096
}
10971097

1098-
// The last node was a Set or a One, we're a Set or One and our options are the same.
1098+
// The last node was a Set/One/Notone, we're a Set/One/Notone, and our options are the same.
10991099
// Merge the two nodes.
11001100
j--;
11011101
prev = children[j];
@@ -1106,6 +1106,11 @@ void ReduceSingleLetterAndNestedAlternations()
11061106
prevCharClass = new RegexCharClass();
11071107
prevCharClass.AddChar(prev.Ch);
11081108
}
1109+
else if (prev.Kind == RegexNodeKind.Notone)
1110+
{
1111+
prevCharClass = new RegexCharClass();
1112+
prevCharClass.AddNotChar(prev.Ch);
1113+
}
11091114
else
11101115
{
11111116
prevCharClass = RegexCharClass.Parse(prev.Str!);
@@ -1115,6 +1120,10 @@ void ReduceSingleLetterAndNestedAlternations()
11151120
{
11161121
prevCharClass.AddChar(at.Ch);
11171122
}
1123+
else if (at.Kind == RegexNodeKind.Notone)
1124+
{
1125+
prevCharClass.AddNotChar(at.Ch);
1126+
}
11181127
else
11191128
{
11201129
RegexCharClass atCharClass = RegexCharClass.Parse(at.Str!);

src/libraries/System.Text.RegularExpressions/tests/UnitTests/RegexReductionTests.cs

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -283,6 +283,10 @@ public class RegexReductionTests
283283
[InlineData("a|b", "[ab]")]
284284
[InlineData("a|b|c|d|e|g|h|z", "[a-eghz]")]
285285
[InlineData("a|b|c|def|g|h", "(?>[a-c]|def|[gh])")]
286+
[InlineData("a|[^a]", @"[\s\S]")]
287+
[InlineData(".|\n", @"[\s\S]")]
288+
[InlineData(".|\n|a", @"[\s\S]")]
289+
[InlineData("abc|.|\n|def", @"abc|[\s\S]|def")]
286290
[InlineData("this|that|there|then|those", "th(?>is|at|ere|en|ose)")]
287291
[InlineData("^this|^that|^there|^then|^those", "^th(?>is|at|ere|en|ose)")]
288292
[InlineData("\bthis|\bthat|\bthere|\bthen|\bthose", "\bth(?>is|at|ere|en|ose)")]

0 commit comments

Comments
 (0)