Skip to content

Commit f981bc2

Browse files
authored
Augment regex source generator's DescribeSet to better describe common sets (#118340)
This only impacts comments in the emitted code. For example, rather than a comment like `Match a character in the set [\p{Lu}].`, the generator now emits `Match a Unicode uppercase letter.`
1 parent 3ef8cf5 commit f981bc2

File tree

2 files changed

+88
-7
lines changed

2 files changed

+88
-7
lines changed

src/libraries/System.Text.RegularExpressions/gen/RegexGenerator.Emitter.cs

Lines changed: 85 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -5632,23 +5632,104 @@ private static string DescribeCapture(int capNum, RegexMethod rm)
56325632
}
56335633

56345634
/// <summary>Gets a textual description of what characters match a set.</summary>
5635-
private static string DescribeSet(string charClass) =>
5636-
charClass switch
5635+
private static string DescribeSet(string charClass)
5636+
{
5637+
string? description = charClass switch
56375638
{
56385639
RegexCharClass.AnyClass => "any character",
5639-
RegexCharClass.DigitClass => "a Unicode digit",
5640+
RegexCharClass.AsciiLetterClass => "an ASCII letter",
5641+
RegexCharClass.AsciiLetterOrDigitClass => "an ASCII letter or digit",
56405642
RegexCharClass.ECMASpaceClass => "a whitespace character (ECMA)",
56415643
RegexCharClass.ECMAWordClass => "a word character (ECMA)",
5644+
RegexCharClass.HexDigitClass => "a hexadecimal digit",
5645+
RegexCharClass.HexDigitLowerClass => "a lowercase hexadecimal digit",
5646+
RegexCharClass.HexDigitUpperClass => "an uppercase hexadecimal digit",
5647+
RegexCharClass.LetterClass => "a Unicode letter",
5648+
RegexCharClass.LetterOrDigitClass => "a Unicode letter or digit",
5649+
RegexCharClass.NotAsciiLetterClass => "any character other than an ASCII letter",
5650+
RegexCharClass.NotAsciiLetterOrDigitClass => "any character other than an ASCII letter or digit",
5651+
RegexCharClass.NotControlClass => "any character other than a Unicode control character",
56425652
RegexCharClass.NotDigitClass => "any character other than a Unicode digit",
56435653
RegexCharClass.NotECMASpaceClass => "any character other than a whitespace character (ECMA)",
56445654
RegexCharClass.NotECMAWordClass => "any character other than a word character (ECMA)",
5655+
RegexCharClass.NotHexDigitClass => "any character other than a hexadecimal digit",
5656+
RegexCharClass.NotHexDigitLowerClass => "any character other than a lowercase hexadecimal digit",
5657+
RegexCharClass.NotHexDigitUpperClass => "any character other than an uppercase hexadecimal digit",
5658+
RegexCharClass.NotLetterClass => "any character other than a Unicode letter",
5659+
RegexCharClass.NotLetterOrDigitClass => "any character other than a Unicode letter or digit",
5660+
RegexCharClass.NotLowerClass => "any character other than a Unicode lowercase letter",
5661+
RegexCharClass.NotNumberClass => "any character other than a Unicode number",
5662+
RegexCharClass.NotPunctuationClass => "any character other than a Unicode punctuation character",
5663+
RegexCharClass.NotSeparatorClass => "any character other than a Unicode separator",
56455664
RegexCharClass.NotSpaceClass => "any character other than a whitespace character",
5665+
RegexCharClass.NotSymbolClass => "any character other than a Unicode symbol",
5666+
RegexCharClass.NotUpperClass => "any character other than a Unicode uppercase letter",
56465667
RegexCharClass.NotWordClass => "any character other than a word character",
5668+
RegexCharClass.NumberClass => "a Unicode number",
5669+
RegexCharClass.PunctuationClass => "a Unicode punctuation character",
5670+
RegexCharClass.SeparatorClass => "a Unicode separator",
56475671
RegexCharClass.SpaceClass => "a whitespace character",
5672+
RegexCharClass.SymbolClass => "a Unicode symbol",
56485673
RegexCharClass.WordClass => "a word character",
5649-
_ => $"a character in the set {RegexCharClass.DescribeSet(charClass)}",
5674+
_ => null,
56505675
};
56515676

5677+
if (description is not null)
5678+
{
5679+
return description;
5680+
}
5681+
5682+
Span<UnicodeCategory> categories = stackalloc UnicodeCategory[1];
5683+
if (RegexCharClass.TryGetOnlyCategories(charClass, categories, out int numCategories, out bool negatedCategories) &&
5684+
numCategories == 1)
5685+
{
5686+
ReadOnlySpan<string?> categoryDescriptions =
5687+
[
5688+
"a Unicode uppercase letter", // UppercaseLetter = 0,
5689+
"a Unicode lowercase letter", // LowercaseLetter = 1,
5690+
"a Unicode titlecase letter", // TitlecaseLetter = 2,
5691+
"a Unicode modifier letter", // ModifierLetter = 3,
5692+
null, // OtherLetter = 4,
5693+
"a Unicode non-spacing mark", // NonSpacingMark = 5,
5694+
"a Unicode spacing-combining mark", // SpacingCombiningMark = 6,
5695+
"a Unicode enclosing mark", // EnclosingMark = 7,
5696+
"a Unicode digit", // DecimalDigitNumber = 8,
5697+
"a Unicode letter number", // LetterNumber = 9,
5698+
null, // OtherNumber = 10,
5699+
"a Unicode space separator", // SpaceSeparator = 11,
5700+
"a Unicode line separator", // LineSeparator = 12,
5701+
"a Unicode paragraph separator", // ParagraphSeparator = 13,
5702+
"a Unicode control character", // Control = 14,
5703+
"a Unicode format character", // Format = 15,
5704+
"a Unicode surrogate character", // Surrogate = 16,
5705+
"a Unicode private-use character", // PrivateUse = 17,
5706+
"a Unicode connector punctuation character", // ConnectorPunctuation = 18,
5707+
"a Unicode dash punctuation character", // DashPunctuation = 19,
5708+
"a Unicode open punctuation character", // OpenPunctuation = 20,
5709+
"a Unicode close punctuation character", // ClosePunctuation = 21,
5710+
"a Unicode initial quote punctuation character", // InitialQuotePunctuation = 22,
5711+
"a Unicode final quote punctuation character", // FinalQuotePunctuation = 23,
5712+
null, // OtherPunctuation = 24,
5713+
"a Unicode math symbol", // MathSymbol = 25,
5714+
"a Unicode currency symbol", // CurrencySymbol = 26,
5715+
"a Unicode modifier symbol", // ModifierSymbol = 27,
5716+
null, // OtherSymbol = 28,
5717+
"an unassigned Unicode code point", // OtherNotAssigned = 29,
5718+
];
5719+
5720+
int cat = (int)categories[0];
5721+
if ((uint)cat < (uint)categoryDescriptions.Length &&
5722+
(description = categoryDescriptions[cat]) is not null)
5723+
{
5724+
return negatedCategories ?
5725+
$"any character other than {description}" :
5726+
description;
5727+
}
5728+
}
5729+
5730+
return $"a character in the set {RegexCharClass.DescribeSet(charClass)}";
5731+
}
5732+
56525733
/// <summary>Writes a textual description of the node tree fit for rendering in source.</summary>
56535734
/// <param name="writer">The writer to which the description should be written.</param>
56545735
/// <param name="node">The node being written.</param>

src/libraries/System.Text.RegularExpressions/tests/FunctionalTests/RegexGeneratorOutputTests.cs

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -790,7 +790,7 @@ partial class C
790790
/// <code>[A-Za-z]+</code><br/>
791791
/// Explanation:<br/>
792792
/// <code>
793-
/// ○ Match a character in the set [A-Za-z] atomically at least once.<br/>
793+
/// ○ Match an ASCII letter atomically at least once.<br/>
794794
/// </code>
795795
/// </remarks>
796796
[global::System.CodeDom.Compiler.GeneratedCodeAttribute("System.Text.RegularExpressions.Generator", "%VERSION%")]
@@ -864,7 +864,7 @@ private bool TryFindNextPossibleStartingPosition(ReadOnlySpan<char> inputSpan)
864864
// Empty matches aren't possible.
865865
if ((uint)pos < (uint)inputSpan.Length)
866866
{
867-
// The pattern begins with a character in the set [A-Za-z].
867+
// The pattern begins with an ASCII letter.
868868
// Find the next occurrence. If it can't be found, there's no match.
869869
int i = inputSpan.Slice(pos).IndexOfAny(Utilities.s_asciiLetters);
870870
if (i >= 0)
@@ -888,7 +888,7 @@ private bool TryMatchAtCurrentPosition(ReadOnlySpan<char> inputSpan)
888888
int matchStart = pos;
889889
ReadOnlySpan<char> slice = inputSpan.Slice(pos);
890890
891-
// Match a character in the set [A-Za-z] atomically at least once.
891+
// Match an ASCII letter atomically at least once.
892892
{
893893
int iteration = slice.IndexOfAnyExcept(Utilities.s_asciiLetters);
894894
if (iteration < 0)

0 commit comments

Comments
 (0)