Skip to content

Commit b5f8e98

Browse files
authored
Improve boundary handling in atomic tests (#118191)
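The word-boundary cases in `CanBeMadeAtomic` were set up to require that whatever comes after the boundary also be disjoint from the loop being tested (the boundary's predecessor). But that's not necessary: when the loop's characters already satisfy the boundary's condition, the boundary itself is sufficient to determine atomicity, so these cases now return `true` immediately instead of falling through to evaluate the next node.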
1 parent 12a954f commit b5f8e98

File tree

2 files changed

+16
-12
lines changed

2 files changed

+16
-12
lines changed

src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexNode.cs

Lines changed: 12 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -2350,15 +2350,15 @@ private static bool CanBeMadeAtomic(RegexNode node, RegexNode subsequent, bool i
23502350
case RegexNodeKind.Multi when node.Ch != subsequent.Str![0]:
23512351
case RegexNodeKind.End:
23522352
case RegexNodeKind.EndZ or RegexNodeKind.Eol when node.Ch != '\n':
2353+
case RegexNodeKind.Boundary when node.M > 0 && RegexCharClass.IsBoundaryWordChar(node.Ch):
2354+
case RegexNodeKind.NonBoundary when node.M > 0 && !RegexCharClass.IsBoundaryWordChar(node.Ch):
2355+
case RegexNodeKind.ECMABoundary when node.M > 0 && RegexCharClass.IsECMAWordChar(node.Ch):
2356+
case RegexNodeKind.NonECMABoundary when node.M > 0 && !RegexCharClass.IsECMAWordChar(node.Ch):
23532357
return true;
23542358

23552359
case RegexNodeKind.Onelazy or RegexNodeKind.Oneloop or RegexNodeKind.Oneloopatomic when subsequent.M == 0 && node.Ch != subsequent.Ch:
23562360
case RegexNodeKind.Notonelazy or RegexNodeKind.Notoneloop or RegexNodeKind.Notoneloopatomic when subsequent.M == 0 && node.Ch == subsequent.Ch:
23572361
case RegexNodeKind.Setlazy or RegexNodeKind.Setloop or RegexNodeKind.Setloopatomic when subsequent.M == 0 && !RegexCharClass.CharInClass(node.Ch, subsequent.Str!):
2358-
case RegexNodeKind.Boundary when node.M > 0 && RegexCharClass.IsBoundaryWordChar(node.Ch):
2359-
case RegexNodeKind.NonBoundary when node.M > 0 && !RegexCharClass.IsBoundaryWordChar(node.Ch):
2360-
case RegexNodeKind.ECMABoundary when node.M > 0 && RegexCharClass.IsECMAWordChar(node.Ch):
2361-
case RegexNodeKind.NonECMABoundary when node.M > 0 && !RegexCharClass.IsECMAWordChar(node.Ch):
23622362
// The loop can be made atomic based on this subsequent node, but we'll need to evaluate the next one as well.
23632363
break;
23642364

@@ -2397,14 +2397,14 @@ private static bool CanBeMadeAtomic(RegexNode node, RegexNode subsequent, bool i
23972397
case RegexNodeKind.Multi when !RegexCharClass.CharInClass(subsequent.Str![0], node.Str!):
23982398
case RegexNodeKind.End:
23992399
case RegexNodeKind.EndZ or RegexNodeKind.Eol when !RegexCharClass.CharInClass('\n', node.Str!):
2400-
return true;
2401-
2402-
case RegexNodeKind.Onelazy or RegexNodeKind.Oneloop or RegexNodeKind.Oneloopatomic when subsequent.M == 0 && !RegexCharClass.CharInClass(subsequent.Ch, node.Str!):
2403-
case RegexNodeKind.Setlazy or RegexNodeKind.Setloop or RegexNodeKind.Setloopatomic when subsequent.M == 0 && !RegexCharClass.MayOverlap(node.Str!, subsequent.Str!):
24042400
case RegexNodeKind.Boundary when node.M > 0 && RegexCharClass.IsKnownWordClassSubset(node.Str!):
24052401
case RegexNodeKind.NonBoundary when node.M > 0 && node.Str is RegexCharClass.NotWordClass or RegexCharClass.NotDigitClass:
24062402
case RegexNodeKind.ECMABoundary when node.M > 0 && node.Str is RegexCharClass.ECMAWordClass or RegexCharClass.ECMADigitClass:
24072403
case RegexNodeKind.NonECMABoundary when node.M > 0 && node.Str is RegexCharClass.NotECMAWordClass or RegexCharClass.NotDigitClass:
2404+
return true;
2405+
2406+
case RegexNodeKind.Onelazy or RegexNodeKind.Oneloop or RegexNodeKind.Oneloopatomic when subsequent.M == 0 && !RegexCharClass.CharInClass(subsequent.Ch, node.Str!):
2407+
case RegexNodeKind.Setlazy or RegexNodeKind.Setloop or RegexNodeKind.Setloopatomic when subsequent.M == 0 && !RegexCharClass.MayOverlap(node.Str!, subsequent.Str!):
24082408
// The loop can be made atomic based on this subsequent node, but we'll need to evaluate the next one as well.
24092409
break;
24102410

@@ -2444,14 +2444,14 @@ bool MayOverlapStartingOrEndingSet(string set) =>
24442444
case RegexNodeKind.Multi when !CharInStartingOrEndingSet(subsequent.Str![0]):
24452445
case RegexNodeKind.End:
24462446
case RegexNodeKind.EndZ or RegexNodeKind.Eol when !CharInStartingOrEndingSet('\n'):
2447-
return true;
2448-
2449-
case RegexNodeKind.Onelazy or RegexNodeKind.Oneloop or RegexNodeKind.Oneloopatomic when subsequent.M == 0 && !CharInStartingOrEndingSet(subsequent.Ch):
2450-
case RegexNodeKind.Setlazy or RegexNodeKind.Setloop or RegexNodeKind.Setloopatomic when subsequent.M == 0 && !MayOverlapStartingOrEndingSet(subsequent.Str!):
24512447
case RegexNodeKind.Boundary when node.M > 0 && RegexCharClass.IsKnownWordClassSubset(loopStartingSet) && RegexCharClass.IsKnownWordClassSubset(loopEndingSet):
24522448
case RegexNodeKind.NonBoundary when node.M > 0 && (loopStartingSet is RegexCharClass.NotWordClass or RegexCharClass.NotDigitClass) && (loopEndingSet is RegexCharClass.NotWordClass or RegexCharClass.NotDigitClass):
24532449
case RegexNodeKind.ECMABoundary when node.M > 0 && (loopStartingSet is RegexCharClass.ECMAWordClass or RegexCharClass.ECMADigitClass) && (loopEndingSet is RegexCharClass.ECMAWordClass or RegexCharClass.ECMADigitClass):
24542450
case RegexNodeKind.NonECMABoundary when node.M > 0 && (loopStartingSet is RegexCharClass.NotECMAWordClass or RegexCharClass.NotDigitClass) && (loopEndingSet is RegexCharClass.NotECMAWordClass or RegexCharClass.NotDigitClass):
2451+
return true;
2452+
2453+
case RegexNodeKind.Onelazy or RegexNodeKind.Oneloop or RegexNodeKind.Oneloopatomic when subsequent.M == 0 && !CharInStartingOrEndingSet(subsequent.Ch):
2454+
case RegexNodeKind.Setlazy or RegexNodeKind.Setloop or RegexNodeKind.Setloopatomic when subsequent.M == 0 && !MayOverlapStartingOrEndingSet(subsequent.Str!):
24552455
// The loop can be made atomic based on this subsequent node, but we'll need to evaluate the next one as well.
24562456
break;
24572457

src/libraries/System.Text.RegularExpressions/tests/UnitTests/RegexReductionTests.cs

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -381,8 +381,12 @@ public class RegexReductionTests
381381
[InlineData("(a+)b", "((?>a+))b")]
382382
[InlineData("a*(?:bcd|efg)", "(?>a*)(?:bcd|efg)")]
383383
[InlineData("\\w+\\b", "(?>\\w+)\\b")]
384+
[InlineData("\\w+\\ba", "(?>\\w+)\\ba")]
385+
[InlineData("\\w+\\b\\w", "(?>\\w+)\\b\\w")]
384386
[InlineData("\\d+\\b", "(?>\\d+)\\b")]
385387
[InlineData("\\W+\\B", "(?>\\W+)\\B")]
388+
[InlineData("\\W+\\B#", "(?>\\W+)\\B#")]
389+
[InlineData("\\W+\\B\\W", "(?>\\W+)\\B\\W")]
386390
[InlineData("\\D+\\B", "(?>\\D+)\\B")]
387391
[InlineData(@"[0-9]+\b", @"(?>[0-9]+)\b")]
388392
[InlineData(@"[a-z]+\b", @"(?>[a-z]+)\b")]

0 commit comments

Comments
 (0)