Skip to content

Commit 97d5ac8

Browse files
authored
Auto-atomic for more loops followed by boundaries (#117892)
Today we make a loop like `\w+\b` or `\d+\b` atomic: once the loop has consumed word characters, the `\b` can only succeed if what follows is not a word character, so giving back a word character or digit would leave the boundary between two word characters, where it can't match. The same logic applies to any loop whose set can match only a subset of the word characters, so we can extend the optimization to such loops as well, for example `[a-f0-9]+\b`.
1 parent 4314b63 commit 97d5ac8

File tree

3 files changed

+66
-2
lines changed

3 files changed

+66
-2
lines changed

src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexCharClass.cs

Lines changed: 53 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1272,6 +1272,55 @@ public static bool IsWordChar(char ch)
12721272
(WordCategoriesMask & (1 << (int)CharUnicodeInfo.GetUnicodeCategory(ch))) != 0;
12731273
}
12741274

1275+
/// <summary>Determines whether the characters that match the specified set are known to all be word characters.</summary>
/// <param name="set">The character class string to examine.</param>
/// <returns>
/// <see langword="true"/> if one of the checks below establishes that everything the set matches is in \w;
/// <see langword="false"/> if a non-word category or character is found, or if none of the checks applies.
/// </returns>
public static bool IsKnownWordClassSubset(string set)
{
    // Fast path: a handful of well-known sets that are subsets of \w.
    bool isWellKnownWordSubset = set is
        WordClass or DigitClass or LetterClass or LetterOrDigitClass or
        AsciiLetterClass or AsciiLetterOrDigitClass or
        HexDigitClass or HexDigitUpperClass or HexDigitLowerClass;
    if (isWellKnownWordSubset)
    {
        return true;
    }

    // Sets made up solely of (non-negated) Unicode categories: every category must be one that's part of \w.
    Span<UnicodeCategory> categoryBuffer = stackalloc UnicodeCategory[16];
    if (TryGetOnlyCategories(set, categoryBuffer, out int categoryCount, out bool isNegated) && !isNegated)
    {
        Span<UnicodeCategory> found = categoryBuffer.Slice(0, categoryCount);
        for (int k = 0; k < found.Length; k++)
        {
            if (!IsWordCategory(found[k]))
            {
                return false;
            }
        }

        return true;
    }

    // Sets whose contents can be enumerated cheaply: walk each (start, end) pair and test every character against \w.
    if (CanEasilyEnumerateSetContents(set))
    {
        int rangesEnd = SetStartIndex + set[SetLengthIndex];
        int pairIndex = SetStartIndex;
        while (pairIndex < rangesEnd)
        {
            int rangeStart = set[pairIndex];
            int rangeStop = set[pairIndex + 1]; // the loop below stops before this value

            for (int ch = rangeStart; ch < rangeStop; ch++)
            {
                if (!CharInClass((char)ch, WordClass))
                {
                    return false;
                }
            }

            pairIndex += 2;
        }

        return true;
    }

    // Nothing above could prove it, so conservatively report that the set isn't known to be a subset of \w.
    return false;
}
1323+
12751324
/// <summary>Determines whether a character is considered a word character for the purposes of testing a word character boundary.</summary>
12761325
public static bool IsBoundaryWordChar(char ch)
12771326
{
@@ -1288,10 +1337,13 @@ public static bool IsBoundaryWordChar(char ch)
12881337
int chDiv8 = ch >> 3;
12891338
return (uint)chDiv8 < (uint)ascii.Length ?
12901339
(ascii[chDiv8] & (1 << (ch & 0x7))) != 0 :
1291-
((WordCategoriesMask & (1 << (int)CharUnicodeInfo.GetUnicodeCategory(ch))) != 0 ||
1340+
(IsWordCategory(CharUnicodeInfo.GetUnicodeCategory(ch)) ||
12921341
(ch == ZeroWidthJoiner | ch == ZeroWidthNonJoiner));
12931342
}
12941343

1344+
/// <summary>Determines whether the specified Unicode category has its bit set in <see cref="WordCategoriesMask"/>.</summary>
private static bool IsWordCategory(UnicodeCategory category)
{
    // Each category is represented by the bit at the position of its enum value.
    int categoryBit = 1 << (int)category;
    return (WordCategoriesMask & categoryBit) != 0;
}
1346+
12951347
/// <summary>Determines whether the 'a' and 'b' values differ by only a single bit, setting that bit in 'mask'.</summary>
12961348
/// <remarks>This isn't specific to RegexCharClass; it's just a convenient place to host it.</remarks>
12971349
public static bool DifferByOneBit(char a, char b, out int mask)

src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexNode.cs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2181,7 +2181,7 @@ private static bool CanBeMadeAtomic(RegexNode node, RegexNode subsequent, bool i
21812181

21822182
case RegexNodeKind.Onelazy or RegexNodeKind.Oneloop or RegexNodeKind.Oneloopatomic when subsequent.M == 0 && !RegexCharClass.CharInClass(subsequent.Ch, node.Str!):
21832183
case RegexNodeKind.Setlazy or RegexNodeKind.Setloop or RegexNodeKind.Setloopatomic when subsequent.M == 0 && !RegexCharClass.MayOverlap(node.Str!, subsequent.Str!):
2184-
case RegexNodeKind.Boundary when node.M > 0 && node.Str is RegexCharClass.WordClass or RegexCharClass.DigitClass:
2184+
case RegexNodeKind.Boundary when node.M > 0 && RegexCharClass.IsKnownWordClassSubset(node.Str!):
21852185
case RegexNodeKind.NonBoundary when node.M > 0 && node.Str is RegexCharClass.NotWordClass or RegexCharClass.NotDigitClass:
21862186
case RegexNodeKind.ECMABoundary when node.M > 0 && node.Str is RegexCharClass.ECMAWordClass or RegexCharClass.ECMADigitClass:
21872187
case RegexNodeKind.NonECMABoundary when node.M > 0 && node.Str is RegexCharClass.NotECMAWordClass or RegexCharClass.NotDigitClass:

src/libraries/System.Text.RegularExpressions/tests/UnitTests/RegexReductionTests.cs

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -341,6 +341,15 @@ public class RegexReductionTests
341341
[InlineData("\\d+\\b", "(?>\\d+)\\b")]
342342
[InlineData("\\W+\\B", "(?>\\W+)\\B")]
343343
[InlineData("\\D+\\B", "(?>\\D+)\\B")]
344+
[InlineData(@"[0-9]+\b", @"(?>[0-9]+)\b")]
345+
[InlineData(@"[a-z]+\b", @"(?>[a-z]+)\b")]
346+
[InlineData(@"[A-Z]+\b", @"(?>[A-Z]+)\b")]
347+
[InlineData(@"[a-zA-Z]+\b", @"(?>[a-zA-Z]+)\b")]
348+
[InlineData(@"[a-fA-F0-9]+\b", @"(?>[a-fA-F0-9]+)\b")]
349+
[InlineData(@"[A-F0-9]+\b", @"(?>[A-F0-9]+)\b")]
350+
[InlineData(@"[a-f0-9]+\b", @"(?>[a-f0-9]+)\b")]
351+
[InlineData(@"[\p{L}\d]+\b", @"(?>[\p{L}\d]+)\b")]
352+
[InlineData(@"[\p{L}\p{Mn}]+\b", @"(?>[\p{L}\p{Mn}]+)\b")]
344353
[InlineData(@"\d+\D", @"(?>\d+)\D")]
345354
[InlineData(@"\D+\d", @"(?>\D+)\d")]
346355
[InlineData(@"\s+\S", @"(?>\s+)\S")]
@@ -494,6 +503,9 @@ public void PatternsReduceIdentically(string actual, string expected)
494503
[InlineData(@"\d*\b", @"(?>\d*)\b")]
495504
[InlineData(@"\W*\B", @"(?>\W*)\B")]
496505
[InlineData(@"\D*\B", @"(?>\D*)\B")]
506+
[InlineData(@"\b[a-z ]+\b", @"\b(?>[a-z ]+)\b")]
507+
[InlineData(@"\b[\p{L}\p{Mn}a]+\b", @"\b(?>[\p{L}\p{Mn}a]+)\b")]
508+
[InlineData(@"\b[\p{C}]+\b", @"\b(?>[\p{C}]+)\b")]
497509
// Loops inside alternation constructs
498510
[InlineData("(abc*|def)chi", "(ab(?>c*)|def)chi")]
499511
[InlineData("(abc|def*)fhi", "(abc|de(?>f*))fhi")]

0 commit comments

Comments
 (0)