Skip to content

Commit dc6ac3a

Browse files
authored
Normalize some well-known negated sets (#118106)
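Some folks end up writing sets like `[^\d]` instead of `\D`, `[^\w]` instead of `\W`, or `[^\s]` instead of `\S`. This defeats some special recognition of the common `\D`, `\W`, and `\S` sets. Normalize these negated forms to their `\D`, `\W`, and `\S` equivalents so that the recognition still applies.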
1 parent 7a68903 commit dc6ac3a

File tree

3 files changed

+20
-0
lines changed

3 files changed

+20
-0
lines changed

src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexCharClass.cs

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -51,10 +51,13 @@ internal sealed partial class RegexCharClass
5151

5252
internal const string SpaceClass = "\u0000\u0000\u0001\u0064"; // \s
5353
internal const string NotSpaceClass = "\u0000\u0000\u0001\uFF9C"; // \S
54+
// Set string produced for the explicitly negated form [^\s]. It is the same char sequence as
// SpaceClass ("\u0000\u0000\u0001\u0064"; 'd' is \u0064) except that the first char is \u0001
// instead of \u0000 -- presumably the negation flag; confirm against the set-string layout.
// RegexNode.ReduceSet has a case that replaces this string with NotSpaceClass (\S).
internal const string NegatedSpaceClass = "\u0001\0\u0001d"; // [^\s]
5455
internal const string WordClass = "\u0000\u0000\u000A\u0000\u0002\u0004\u0005\u0003\u0001\u0006\u0009\u0013\u0000"; // \w
5556
internal const string NotWordClass = "\u0000\u0000\u000A\u0000\uFFFE\uFFFC\uFFFB\uFFFD\uFFFF\uFFFA\uFFF7\uFFED\u0000"; // \W
57+
// Set string produced for the explicitly negated form [^\w]. It is the same char sequence as
// WordClass ('\n' is \u000A, '\t' is \u0009) except that the first char is \u0001 instead of
// \u0000 -- presumably the negation flag; confirm against the set-string layout.
// RegexNode.ReduceSet has a case that replaces this string with NotWordClass (\W).
internal const string NegatedWordClass = "\u0001\0\n\0\u0002\u0004\u0005\u0003\u0001\u0006\t\u0013\0"; // [^\w]
5658
internal const string DigitClass = "\u0000\u0000\u0001\u0009"; // \d
5759
internal const string NotDigitClass = "\u0000\u0000\u0001\uFFF7"; // \D
60+
// Set string produced for the explicitly negated form [^\d]. It is the same char sequence as
// DigitClass ("\u0000\u0000\u0001\u0009"; '\t' is \u0009) except that the first char is \u0001
// instead of \u0000 -- presumably the negation flag; confirm against the set-string layout.
// RegexNode.ReduceSet has a case that replaces this string with NotDigitClass (\D).
internal const string NegatedDigitClass = "\u0001\0\u0001\t"; // [^\d]
5861
internal const string ControlClass = "\0\0\u0001\u000f"; // \p{Cc}
5962
internal const string NotControlClass = "\0\0\u0001\ufff1"; // \P{Cc}
6063
internal const string LetterClass = "\0\0\a\0\u0002\u0004\u0005\u0003\u0001\0"; // \p{L}

src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexNode.cs

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -981,6 +981,17 @@ private RegexNode ReduceSet()
981981
case RegexCharClass.NotSpaceSpaceClass:
982982
Str = RegexCharClass.AnyClass;
983983
break;
984+
985+
// Different ways of saying \D, \S, \W
986+
case RegexCharClass.NegatedDigitClass: // [^\d]
987+
Str = RegexCharClass.NotDigitClass;
988+
break;
989+
case RegexCharClass.NegatedSpaceClass: // [^\s]
990+
Str = RegexCharClass.NotSpaceClass;
991+
break;
992+
case RegexCharClass.NegatedWordClass: // [^\w]
993+
Str = RegexCharClass.NotWordClass;
994+
break;
984995
}
985996

986997
return this;

src/libraries/System.Text.RegularExpressions/tests/UnitTests/RegexReductionTests.cs

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,12 @@ namespace System.Text.RegularExpressions.Tests
99
public class RegexReductionTests
1010
{
1111
[Theory]
12+
// Well-known sets
13+
[InlineData(@"[^\d]", @"\D")]
14+
[InlineData(@"[^\w]", @"\W")]
15+
[InlineData(@"[^\s]", @"\S")]
16+
[InlineData(@"[\s\S]", @"[\d\D]")]
17+
[InlineData(@"[\s\S]", @"[\w\W]")]
1218
// Two greedy one loops
1319
[InlineData("a*a*", "a*")]
1420
[InlineData("(a*a*)", "(a*)")]

0 commit comments

Comments
 (0)