Skip to content

Commit 5196f3e

Browse files
authored
Avoid extra boundary checks when preceded/succeeded char set is known (#118105)
* Avoid extra boundary checks when preceded/succeeded char set is known
  If we statically know by construction that what comes before or after a \b is guaranteed to be a word char, then we can avoid half the run-time checks. This also tweaks the source-generated implementation of IsBoundaryWordChar in order to avoid an extra branch on every check. It's currently delegating to IsWordChar and then, if that returns false, checking whether it's one of the other two joiner characters that are considered as part of the boundary set. Instead, this duplicates the IsWordChar implementation (which is just a couple of lines once the helpers are separated out into their own members), such that for ASCII, the additional check isn't necessary. The implementations used by the interpreter and RegexCompiler already do this.
* Address PR feedback
1 parent 307753a commit 5196f3e

File tree

6 files changed

+282
-90
lines changed

6 files changed

+282
-90
lines changed

src/libraries/System.Text.RegularExpressions/gen/RegexGenerator.Emitter.cs

Lines changed: 166 additions & 57 deletions
Original file line numberDiff line numberDiff line change
@@ -302,94 +302,190 @@ private static string GetTimeoutExpression(int matchTimeout) =>
302302
"Regex.InfiniteMatchTimeout" :
303303
$"TimeSpan.FromMilliseconds({matchTimeout.ToString(CultureInfo.InvariantCulture)})";
304304

305+
private const string IsBoundary = nameof(IsBoundary);
306+
private const string IsECMABoundary = nameof(IsECMABoundary);
307+
private const string IsWordChar = nameof(IsWordChar);
308+
private const string IsBoundaryWordChar = nameof(IsBoundaryWordChar);
309+
private const string IsPostWordCharBoundary = nameof(IsPostWordCharBoundary);
310+
private const string IsPreWordCharBoundary = nameof(IsPreWordCharBoundary);
311+
private const string IsECMABoundaryWordChar = nameof(IsECMABoundaryWordChar);
312+
private const string WordCategoriesMask = nameof(WordCategoriesMask);
313+
private const string WordCharBitmap = nameof(WordCharBitmap);
314+
315+
private static void AddWordCharHelpersSupport(Dictionary<string, string[]> requiredHelpers)
316+
{
317+
const string WordCharHelpersSupport = nameof(WordCharHelpersSupport);
318+
if (!requiredHelpers.ContainsKey(WordCharHelpersSupport))
319+
{
320+
requiredHelpers.Add(WordCharHelpersSupport,
321+
[
322+
"/// <summary>Provides a mask of Unicode categories that combine to form [\\w].</summary>",
323+
$"private const int {WordCategoriesMask} =",
324+
" 1 << (int)UnicodeCategory.UppercaseLetter |",
325+
" 1 << (int)UnicodeCategory.LowercaseLetter |",
326+
" 1 << (int)UnicodeCategory.TitlecaseLetter |",
327+
" 1 << (int)UnicodeCategory.ModifierLetter |",
328+
" 1 << (int)UnicodeCategory.OtherLetter |",
329+
" 1 << (int)UnicodeCategory.NonSpacingMark |",
330+
" 1 << (int)UnicodeCategory.DecimalDigitNumber |",
331+
" 1 << (int)UnicodeCategory.ConnectorPunctuation;",
332+
"",
333+
"/// <summary>Gets a bitmap for whether each character 0 through 127 is in [\\w]</summary>",
334+
$"private static ReadOnlySpan<byte> {WordCharBitmap} => new byte[]",
335+
"{",
336+
" 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xFF, 0x03,",
337+
" 0xFE, 0xFF, 0xFF, 0x87, 0xFE, 0xFF, 0xFF, 0x07",
338+
"};",
339+
]);
340+
}
341+
}
342+
305343
/// <summary>Adds the IsWordChar helper to the required helpers collection.</summary>
306344
private static void AddIsWordCharHelper(Dictionary<string, string[]> requiredHelpers)
307345
{
308-
const string IsWordChar = nameof(IsWordChar);
309346
if (!requiredHelpers.ContainsKey(IsWordChar))
310347
{
311348
requiredHelpers.Add(IsWordChar,
312349
[
313-
"/// <summary>Determines whether the character is part of the [\\w] set.</summary>",
314-
"[MethodImpl(MethodImplOptions.AggressiveInlining)]",
315-
"internal static bool IsWordChar(char ch)",
316-
"{",
317-
" // Mask of Unicode categories that combine to form [\\w]",
318-
" const int WordCategoriesMask =",
319-
" 1 << (int)UnicodeCategory.UppercaseLetter |",
320-
" 1 << (int)UnicodeCategory.LowercaseLetter |",
321-
" 1 << (int)UnicodeCategory.TitlecaseLetter |",
322-
" 1 << (int)UnicodeCategory.ModifierLetter |",
323-
" 1 << (int)UnicodeCategory.OtherLetter |",
324-
" 1 << (int)UnicodeCategory.NonSpacingMark |",
325-
" 1 << (int)UnicodeCategory.DecimalDigitNumber |",
326-
" 1 << (int)UnicodeCategory.ConnectorPunctuation;",
327-
"",
328-
" // Bitmap for whether each character 0 through 127 is in [\\w]",
329-
" ReadOnlySpan<byte> ascii = new byte[]",
330-
" {",
331-
" 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xFF, 0x03,",
332-
" 0xFE, 0xFF, 0xFF, 0x87, 0xFE, 0xFF, 0xFF, 0x07",
333-
" };",
334-
"",
335-
" // If the char is ASCII, look it up in the bitmap. Otherwise, query its Unicode category.",
336-
" int chDiv8 = ch >> 3;",
337-
" return (uint)chDiv8 < (uint)ascii.Length ?",
338-
" (ascii[chDiv8] & (1 << (ch & 0x7))) != 0 :",
339-
" (WordCategoriesMask & (1 << (int)CharUnicodeInfo.GetUnicodeCategory(ch))) != 0;",
340-
"}",
350+
$"/// <summary>Determines whether the character is part of the [\\w] set.</summary>",
351+
$"[MethodImpl(MethodImplOptions.AggressiveInlining)]",
352+
$"internal static bool {IsWordChar}(char ch)",
353+
$"{{",
354+
$" // If the char is ASCII, look it up in the bitmap. Otherwise, query its Unicode category.",
355+
$" ReadOnlySpan<byte> ascii = {WordCharBitmap};",
356+
$" int chDiv8 = ch >> 3;",
357+
$" return (uint)chDiv8 < (uint)ascii.Length ?",
358+
$" (ascii[chDiv8] & (1 << (ch & 0x7))) != 0 :",
359+
$" ({WordCategoriesMask} & (1 << (int)CharUnicodeInfo.GetUnicodeCategory(ch))) != 0;",
360+
$"}}",
361+
]);
362+
363+
AddWordCharHelpersSupport(requiredHelpers);
364+
}
365+
}
366+
367+
/// <summary>Adds the IsBoundaryWordChar helper to the required helpers collection.</summary>
368+
private static void AddIsBoundaryWordCharHelper(Dictionary<string, string[]> requiredHelpers)
369+
{
370+
if (!requiredHelpers.ContainsKey(IsBoundaryWordChar))
371+
{
372+
requiredHelpers.Add(IsBoundaryWordChar,
373+
[
374+
$"/// <summary>Determines whether the specified index is a boundary word character.</summary>",
375+
$"/// <remarks>This is the same as \\w plus U+200C ZERO WIDTH NON-JOINER and U+200D ZERO WIDTH JOINER.</remarks>",
376+
$"[MethodImpl(MethodImplOptions.AggressiveInlining)]",
377+
$"internal static bool {IsBoundaryWordChar}(char ch)",
378+
$"{{",
379+
$" ReadOnlySpan<byte> ascii = {WordCharBitmap};",
380+
$" int chDiv8 = ch >> 3;",
381+
$" return (uint)chDiv8 < (uint)ascii.Length ?",
382+
$" (ascii[chDiv8] & (1 << (ch & 0x7))) != 0 :",
383+
$" (({WordCategoriesMask} & (1 << (int)CharUnicodeInfo.GetUnicodeCategory(ch))) != 0) || (ch is '\u200C' or '\u200D');",
384+
$"}}",
385+
]);
386+
387+
AddWordCharHelpersSupport(requiredHelpers);
388+
}
389+
}
390+
391+
/// <summary>Adds the IsECMABoundaryWordChar helper to the required helpers collection.</summary>
392+
private static void AddIsECMABoundaryWordCharHelper(Dictionary<string, string[]> requiredHelpers)
393+
{
394+
if (!requiredHelpers.ContainsKey(IsECMABoundaryWordChar))
395+
{
396+
requiredHelpers.Add(IsECMABoundaryWordChar,
397+
[
398+
$"/// <summary>Determines whether the specified index is a boundary (ECMAScript) word character.</summary>",
399+
$"[MethodImpl(MethodImplOptions.AggressiveInlining)]",
400+
$"internal static bool {IsECMABoundaryWordChar}(char ch) =>",
401+
$" char.IsAsciiLetterOrDigit(ch) ||",
402+
$" ch is '_' or '\\u0130'; // latin capital letter I with dot above",
341403
]);
342404
}
343405
}
344406

345407
/// <summary>Adds the IsBoundary helper to the required helpers collection.</summary>
346408
private static void AddIsBoundaryHelper(Dictionary<string, string[]> requiredHelpers, bool checkOverflow)
347409
{
348-
const string IsBoundary = nameof(IsBoundary);
349410
if (!requiredHelpers.ContainsKey(IsBoundary))
350411
{
351412
string uncheckedKeyword = checkOverflow ? "unchecked" : "";
352413
requiredHelpers.Add(IsBoundary,
353414
[
354415
$"/// <summary>Determines whether the specified index is a boundary.</summary>",
355416
$"[MethodImpl(MethodImplOptions.AggressiveInlining)]",
356-
$"internal static bool IsBoundary(ReadOnlySpan<char> inputSpan, int index)",
417+
$"internal static bool {IsBoundary}(ReadOnlySpan<char> inputSpan, int index)",
357418
$"{{",
358419
$" int indexMinus1 = index - 1;",
359-
$" return {uncheckedKeyword}((uint)indexMinus1 < (uint)inputSpan.Length && IsBoundaryWordChar(inputSpan[indexMinus1])) !=",
360-
$" {uncheckedKeyword}((uint)index < (uint)inputSpan.Length && IsBoundaryWordChar(inputSpan[index]));",
361-
$"",
362-
$" static bool IsBoundaryWordChar(char ch) => IsWordChar(ch) || (ch == '\\u200C' | ch == '\\u200D');",
420+
$" return {uncheckedKeyword}((uint)indexMinus1 < (uint)inputSpan.Length && {IsBoundaryWordChar}(inputSpan[indexMinus1])) !=",
421+
$" {uncheckedKeyword}((uint)index < (uint)inputSpan.Length && {IsBoundaryWordChar}(inputSpan[index]));",
363422
$"}}",
364423
]);
365424

366-
AddIsWordCharHelper(requiredHelpers);
425+
AddIsBoundaryWordCharHelper(requiredHelpers);
426+
}
427+
}
428+
429+
/// <summary>Adds the IsPreWordCharBoundary helper to the required helpers collection.</summary>
430+
private static void AddIsPreWordCharBoundaryHelper(Dictionary<string, string[]> requiredHelpers, bool checkOverflow)
431+
{
432+
if (!requiredHelpers.ContainsKey(IsPreWordCharBoundary))
433+
{
434+
string uncheckedKeyword = checkOverflow ? "unchecked" : "";
435+
requiredHelpers.Add(IsPreWordCharBoundary,
436+
[
437+
$"/// <summary>Determines whether the specified index is a boundary.</summary>",
438+
$"/// <remarks>This variant is only employed when the subsequent character will separately be validated as a word character.</remarks>",
439+
$"[MethodImpl(MethodImplOptions.AggressiveInlining)]",
440+
$"internal static bool {IsPreWordCharBoundary}(ReadOnlySpan<char> inputSpan, int index)",
441+
$"{{",
442+
$" int indexMinus1 = index - 1;",
443+
$" return {uncheckedKeyword}((uint)indexMinus1 >= (uint)inputSpan.Length || !{IsBoundaryWordChar}(inputSpan[indexMinus1]));",
444+
$"}}",
445+
]);
446+
447+
AddIsBoundaryWordCharHelper(requiredHelpers);
448+
}
449+
}
450+
451+
/// <summary>Adds the IsPostWordCharBoundary helper to the required helpers collection.</summary>
452+
private static void AddIsPostWordCharBoundaryHelper(Dictionary<string, string[]> requiredHelpers, bool checkOverflow)
453+
{
454+
if (!requiredHelpers.ContainsKey(IsPostWordCharBoundary))
455+
{
456+
string uncheckedKeyword = checkOverflow ? "unchecked" : "";
457+
requiredHelpers.Add(IsPostWordCharBoundary,
458+
[
459+
$"/// <summary>Determines whether the specified index is a boundary.</summary>",
460+
$"/// <remarks>This variant is only employed when the previous character has already been validated as a word character.</remarks>",
461+
$"[MethodImpl(MethodImplOptions.AggressiveInlining)]",
462+
$"internal static bool {IsPostWordCharBoundary}(ReadOnlySpan<char> inputSpan, int index) =>",
463+
$" {uncheckedKeyword}((uint)index >= (uint)inputSpan.Length || !{IsBoundaryWordChar}(inputSpan[index]));",
464+
]);
465+
466+
AddIsBoundaryWordCharHelper(requiredHelpers);
367467
}
368468
}
369469

370470
/// <summary>Adds the IsECMABoundary helper to the required helpers collection.</summary>
371471
private static void AddIsECMABoundaryHelper(Dictionary<string, string[]> requiredHelpers, bool checkOverflow)
372472
{
373-
const string IsECMABoundary = nameof(IsECMABoundary);
374473
if (!requiredHelpers.ContainsKey(IsECMABoundary))
375474
{
376475
string uncheckedKeyword = checkOverflow ? "unchecked" : "";
377476
requiredHelpers.Add(IsECMABoundary,
378477
[
379478
$"/// <summary>Determines whether the specified index is a boundary (ECMAScript).</summary>",
380479
$"[MethodImpl(MethodImplOptions.AggressiveInlining)]",
381-
$"internal static bool IsECMABoundary(ReadOnlySpan<char> inputSpan, int index)",
480+
$"internal static bool {IsECMABoundary}(ReadOnlySpan<char> inputSpan, int index)",
382481
$"{{",
383482
$" int indexMinus1 = index - 1;",
384-
$" return {uncheckedKeyword}((uint)indexMinus1 < (uint)inputSpan.Length && IsECMAWordChar(inputSpan[indexMinus1])) !=",
385-
$" {uncheckedKeyword}((uint)index < (uint)inputSpan.Length && IsECMAWordChar(inputSpan[index]));",
386-
$"",
387-
$" static bool IsECMAWordChar(char ch) =>",
388-
$" char.IsAsciiLetterOrDigit(ch) ||",
389-
$" ch == '_' ||",
390-
$" ch == '\\u0130'; // latin capital letter I with dot above",
483+
$" return {uncheckedKeyword}((uint)indexMinus1 < (uint)inputSpan.Length && {IsECMABoundaryWordChar}(inputSpan[indexMinus1])) !=",
484+
$" {uncheckedKeyword}((uint)index < (uint)inputSpan.Length && {IsECMABoundaryWordChar}(inputSpan[index]));",
391485
$"}}",
392486
]);
487+
488+
AddIsECMABoundaryWordCharHelper(requiredHelpers);
393489
}
394490
}
395491

@@ -3177,20 +3273,33 @@ void EmitBoundary(RegexNode node)
31773273
{
31783274
Debug.Assert(node.Kind is RegexNodeKind.Boundary or RegexNodeKind.NonBoundary or RegexNodeKind.ECMABoundary or RegexNodeKind.NonECMABoundary, $"Unexpected kind: {node.Kind}");
31793275

3276+
string negation = node.Kind is RegexNodeKind.Boundary or RegexNodeKind.ECMABoundary ? "!" : "";
3277+
31803278
string call;
3181-
if (node.Kind is RegexNodeKind.Boundary or RegexNodeKind.NonBoundary)
3182-
{
3183-
call = node.Kind is RegexNodeKind.Boundary ?
3184-
$"!{HelpersTypeName}.IsBoundary" :
3185-
$"{HelpersTypeName}.IsBoundary";
3186-
AddIsBoundaryHelper(requiredHelpers, checkOverflow);
3187-
}
3188-
else
3279+
switch (node.Kind)
31893280
{
3190-
call = node.Kind is RegexNodeKind.ECMABoundary ?
3191-
$"!{HelpersTypeName}.IsECMABoundary" :
3192-
$"{HelpersTypeName}.IsECMABoundary";
3193-
AddIsECMABoundaryHelper(requiredHelpers, checkOverflow);
3281+
case RegexNodeKind.Boundary or RegexNodeKind.NonBoundary:
3282+
if (node.IsKnownPrecededByWordChar())
3283+
{
3284+
call = $"{negation}{HelpersTypeName}.{IsPostWordCharBoundary}";
3285+
AddIsPostWordCharBoundaryHelper(requiredHelpers, checkOverflow);
3286+
}
3287+
else if (node.IsKnownSucceededByWordChar())
3288+
{
3289+
call = $"{negation}{HelpersTypeName}.{IsPreWordCharBoundary}";
3290+
AddIsPreWordCharBoundaryHelper(requiredHelpers, checkOverflow);
3291+
}
3292+
else
3293+
{
3294+
call = $"{negation}{HelpersTypeName}.{IsBoundary}";
3295+
AddIsBoundaryHelper(requiredHelpers, checkOverflow);
3296+
}
3297+
break;
3298+
3299+
default:
3300+
call = $"{negation}{HelpersTypeName}.{IsECMABoundary}";
3301+
AddIsECMABoundaryHelper(requiredHelpers, checkOverflow);
3302+
break;
31943303
}
31953304

31963305
using (EmitBlock(writer, $"if ({call}(inputSpan, pos{(sliceStaticPos > 0 ? $" + {sliceStaticPos}" : "")}))"))

src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexCompiler.cs

Lines changed: 32 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -35,6 +35,8 @@ internal abstract class RegexCompiler
3535
private static MethodInfo MatchLengthMethod => field ??= RegexRunnerMethod("MatchLength");
3636
private static MethodInfo MatchIndexMethod => field ??= RegexRunnerMethod("MatchIndex");
3737
private static MethodInfo IsBoundaryMethod => field ??= typeof(RegexRunner).GetMethod("IsBoundary", BindingFlags.NonPublic | BindingFlags.Static, [typeof(ReadOnlySpan<char>), typeof(int)])!;
38+
private static MethodInfo IsPreWordCharBoundaryMethod => field ??= typeof(RegexRunner).GetMethod("IsPreWordCharBoundary", BindingFlags.NonPublic | BindingFlags.Static, [typeof(ReadOnlySpan<char>), typeof(int)])!;
39+
private static MethodInfo IsPostWordCharBoundaryMethod => field ??= typeof(RegexRunner).GetMethod("IsPostWordCharBoundary", BindingFlags.NonPublic | BindingFlags.Static, [typeof(ReadOnlySpan<char>), typeof(int)])!;
3840
private static MethodInfo IsWordCharMethod => field ??= RegexRunnerMethod("IsWordChar");
3941
private static MethodInfo IsECMABoundaryMethod => field ??= typeof(RegexRunner).GetMethod("IsECMABoundary", BindingFlags.NonPublic | BindingFlags.Static, [typeof(ReadOnlySpan<char>), typeof(int)])!;
4042
private static MethodInfo CrawlposMethod => field ??= RegexRunnerMethod("Crawlpos");
@@ -3050,25 +3052,41 @@ void EmitBoundary(RegexNode node)
30503052
}
30513053
switch (node.Kind)
30523054
{
3053-
case RegexNodeKind.Boundary:
3054-
Call(IsBoundaryMethod);
3055-
BrfalseFar(doneLabel);
3056-
break;
3057-
3058-
case RegexNodeKind.NonBoundary:
3059-
Call(IsBoundaryMethod);
3060-
BrtrueFar(doneLabel);
3061-
break;
3055+
case RegexNodeKind.Boundary or RegexNodeKind.NonBoundary:
3056+
if (node.IsKnownPrecededByWordChar())
3057+
{
3058+
Call(IsPostWordCharBoundaryMethod);
3059+
}
3060+
else if (node.IsKnownSucceededByWordChar())
3061+
{
3062+
Call(IsPreWordCharBoundaryMethod);
3063+
}
3064+
else
3065+
{
3066+
Call(IsBoundaryMethod);
3067+
}
30623068

3063-
case RegexNodeKind.ECMABoundary:
3064-
Call(IsECMABoundaryMethod);
3065-
BrfalseFar(doneLabel);
3069+
if (node.Kind is RegexNodeKind.Boundary)
3070+
{
3071+
BrfalseFar(doneLabel);
3072+
}
3073+
else
3074+
{
3075+
BrtrueFar(doneLabel);
3076+
}
30663077
break;
30673078

30683079
default:
3069-
Debug.Assert(node.Kind == RegexNodeKind.NonECMABoundary);
30703080
Call(IsECMABoundaryMethod);
3071-
BrtrueFar(doneLabel);
3081+
3082+
if (node.Kind is RegexNodeKind.ECMABoundary)
3083+
{
3084+
BrfalseFar(doneLabel);
3085+
}
3086+
else
3087+
{
3088+
BrtrueFar(doneLabel);
3089+
}
30723090
break;
30733091
}
30743092
}

0 commit comments

Comments
 (0)