Skip to content

Commit 48a162c

Browse files
authored
Fix longstanding regex interpreter bug around lazy loops with empty matches (#120872)
This has been an issue in the interpreter forever, as far as I can tell. We've had multiple issues over the years all flagging problems with different symptoms that stem from the same core problem. Fixes #43314 Fixes #58786 Fixes #63385 Fixes #111051 Fixes #114626 The problem came down to how the regex interpreter handled lazy quantifiers over expressions that can match the empty string. When the interpreter reaches one of these lazy loops, it uses an internal instruction called `Lazybranchmark` to manage entering and potentially looping the subexpression. To keep track of loop state and capture boundaries, the interpreter uses two internal stacks, a grouping stack, that tracks positions relevant to capturing groups (e.g. where a group started), and a backtracking stack, that tracks states that are needed if the engine has to go back and try a different match. The bug occurred in the case when the subpattern inside the lazy loop matches nothing. In this case, the interpreter unconditionally pushed a placeholder onto the grouping stack. If the rest of the pattern then succeeded without backtracking through this loop, that extra placeholder remained on the grouping stack. This polluted the capture bookkeeping: later parts of the pattern popped that placeholder, treating it as a real start position, and shifted captures to the wrong place. The fix is to stop pushing onto the grouping stack when the loop matches empty. Instead, the interpreter records two things on the backtracking stack: the old group boundary and a flag indicating whether the grouping stack needs to be popped later. If the interpreter ends up backtracking through this lazy loop, it checks the flag: if a grouping stack entry was added earlier, it pops it; if not, it leaves the grouping stack untouched. This keeps the grouping stack and backtracking stack in sync in both forward and backtracking paths. As a result, empty lazy loops no longer leave stray entries on the grouping stack. This also prevents the unbounded stack growth that previously caused overflows or hangs on some patterns involving nested lazy quantifiers.
1 parent cd66519 commit 48a162c

File tree

3 files changed

+60
-11
lines changed

3 files changed

+60
-11
lines changed

src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexInterpreter.cs

Lines changed: 20 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -521,12 +521,10 @@ private bool TryMatchAtCurrentPosition(ReadOnlySpan<char> inputSpan)
521521
}
522522
else
523523
{
524-
// The inner expression found an empty match, so we'll go directly to 'back2' if we
525-
// backtrack. In this case, we need to push something on the stack, since back2 pops.
526-
// However, in the case of ()+? or similar, this empty match may be legitimate, so push the text
527-
// position associated with that empty match.
528-
StackPush(oldMarkPos);
529-
TrackPush2(StackPeek()); // Save old mark
524+
// The inner expression found an empty match. We'll go directly to BacktrackingSecond
525+
// if we backtrack. We do not touch the grouping stack here... instead, we record the
526+
// old mark and a "no-stack-pop" flag (0) on the backtracking stack.
527+
TrackPush2(oldMarkPos, 0);
530528
}
531529
}
532530
advance = 1;
@@ -541,7 +539,11 @@ private bool TryMatchAtCurrentPosition(ReadOnlySpan<char> inputSpan)
541539
// fails, we go to Lazybranchmark | RegexOpcode.Back2
542540
TrackPop(2);
543541
int pos = TrackPeek(1);
544-
TrackPush2(TrackPeek()); // Save old mark
542+
543+
// Store the previous mark. The second value (1) flags that a grouping stack frame
544+
// must be popped when backtracking, because a new frame will be pushed next.
545+
TrackPush2(TrackPeek(), 1);
546+
545547
StackPush(pos); // Make new mark
546548
runtextpos = pos; // Recall position
547549
Goto(Operand(0)); // Loop
@@ -551,9 +553,17 @@ private bool TryMatchAtCurrentPosition(ReadOnlySpan<char> inputSpan)
551553
case RegexOpcode.Lazybranchmark | RegexOpcode.BacktrackingSecond:
552554
// The lazy loop has failed. We'll do a true backtrack and
553555
// start over before the lazy loop.
554-
StackPop();
555-
TrackPop();
556-
StackPush(TrackPeek()); // Recall old mark
556+
int needsPop = runtrack![runtrackpos]; // flag: 0 or 1
557+
int oldMark = runtrack[runtrackpos + 1]; // saved old mark
558+
runtrackpos += 2; // consume both payload ints
559+
if (needsPop != 0)
560+
{
561+
// We pushed on the grouping stack in the Backtracking arm; balance it now.
562+
StackPop();
563+
}
564+
565+
// Restore the old mark and backtrack
566+
StackPush(oldMark);
557567
break;
558568

559569
case RegexOpcode.Setcount:

src/libraries/System.Text.RegularExpressions/tests/FunctionalTests/Regex.Match.Tests.cs

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -329,6 +329,26 @@ public static IEnumerable<object[]> Match_MemberData()
329329
yield return (@"(b|a|aa)((?:aa)+?)+?$", "aaaaaaaa", RegexOptions.None, 0, 8, true, "aaaaaaaa");
330330
yield return (@"(|a|aa)(((?:aa)+?)+?|aaaaab)\w$", "aaaaaabc", RegexOptions.None, 0, 8, true, "aaaaaabc");
331331

332+
// Lazy loops with empty matches
333+
if (!PlatformDetection.IsNetFramework)
334+
{
335+
// Fails on .NET Framework: https://github.com/dotnet/runtime/issues/111051
336+
yield return (@"(.)+()+?b", "xyzb", RegexOptions.None, 0, 4, true, "xyzb");
337+
yield return (@"^(.)+()+?b", "xyzb", RegexOptions.None, 0, 4, true, "xyzb");
338+
339+
if (!RegexHelpers.IsNonBacktracking(engine))
340+
{
341+
// Fails on .NET Framework: https://github.com/dotnet/runtime/issues/58786
342+
yield return (@"(?<!a*(?:a?)+?)", @"x", RegexOptions.None, 0, 1, false, "");
343+
344+
// Fails on .NET Framework: https://github.com/dotnet/runtime/issues/114626
345+
yield return (@"(?>(-*)+?-*)$", "abc", RegexOptions.None, 0, 3, true, "");
346+
347+
// Fails on .NET Framework: https://github.com/dotnet/runtime/issues/63385
348+
yield return (@"(^+?)?()", "1", RegexOptions.None, 0, 1, true, "");
349+
}
350+
}
351+
332352
// Nested loops
333353
yield return (@"(abcd*)+e", "abcde", RegexOptions.None, 0, 5, true, "abcde");
334354
yield return (@"(abcd*?)+e", "abcde", RegexOptions.None, 0, 5, true, "abcde");

src/libraries/System.Text.RegularExpressions/tests/FunctionalTests/Regex.MultipleMatches.Tests.cs

Lines changed: 20 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -452,9 +452,18 @@ public static IEnumerable<object[]> Matches_TestData()
452452
new CaptureData("x", 3, 1),
453453
}
454454
};
455+
456+
// Fails on .NET Framework: https://github.com/dotnet/runtime/issues/111051
457+
yield return new object[]
458+
{
459+
engine, @"anyexpress1(?<=(.(any express|(any express)*)+?)anyexpress1)", "anystring anyexpress1", RegexOptions.None, new[]
460+
{
461+
new CaptureData("anyexpress1", 10, 11),
462+
}
463+
};
455464
}
456465

457-
// Fails on .NET Framework: [ActiveIssue("https://github.com/dotnet/runtime/issues/62094")]
466+
// Fails on .NET Framework: https://github.com/dotnet/runtime/issues/62094
458467
yield return new object[]
459468
{
460469
engine, @"(?:){93}", "x", RegexOptions.None, new[]
@@ -463,6 +472,16 @@ public static IEnumerable<object[]> Matches_TestData()
463472
new CaptureData("", 1, 0)
464473
}
465474
};
475+
476+
// Fails on .NET Framework: https://github.com/dotnet/runtime/issues/43314
477+
yield return new object[]
478+
{
479+
engine, @"(?:(?:0?)+?(?:a?)+?)?", "0a", RegexOptions.None, new[]
480+
{
481+
new CaptureData("0a", 0, 2),
482+
new CaptureData("", 2, 0),
483+
}
484+
};
466485
#endif
467486
}
468487
}

0 commit comments

Comments
 (0)