@@ -893,10 +893,7 @@ private RegexNode ReduceLoops()
893
893
894
894
// If the Loop or Lazyloop now only has one child node and it's a Set, One, or Notone,
895
895
// reduce to just Setloop/lazy, Oneloop/lazy, or Notoneloop/lazy. The parser will
896
- // generally have only produced the latter, but other reductions could have exposed
897
- // this. We can also reduce or eliminate certain loops that are nops, e.g.
898
- // a loop with a minimum of 0 that wraps a zero-width assertion is either asserting something
899
- // or not, and is thus useless.
896
+ // generally have only produced the latter, but other reductions could have exposed this.
900
897
if ( u . ChildCount ( ) == 1 )
901
898
{
902
899
RegexNode child = u . Child ( 0 ) ;
@@ -910,14 +907,27 @@ private RegexNode ReduceLoops()
910
907
break ;
911
908
912
909
case RegexNodeKind . Empty :
913
- case RegexNodeKind . PositiveLookaround or RegexNodeKind . NegativeLookaround or
910
+ // A loop around an empty is itself empty, regardless of iteration counts.
911
+ u = child ;
912
+ break ;
913
+
914
+ case RegexNodeKind . PositiveLookaround when ContainsKind ( child , [ RegexNodeKind . Capture ] ) is false :
915
+ case RegexNodeKind . NegativeLookaround or
914
916
RegexNodeKind . Beginning or RegexNodeKind . Start or
915
917
RegexNodeKind . Bol or RegexNodeKind . Eol or
916
918
RegexNodeKind . End or RegexNodeKind . EndZ or
917
919
RegexNodeKind . Boundary or RegexNodeKind . ECMABoundary or
918
- RegexNodeKind . NonBoundary or RegexNodeKind . NonECMABoundary
919
- when u . M == 0 :
920
- u = new RegexNode ( RegexNodeKind . Empty , Options ) ;
920
+ RegexNodeKind . NonBoundary or RegexNodeKind . NonECMABoundary :
921
+ // A loop around (most) zero-width assertions can also be reduced. If it has a lower bound of 0,
922
+ // then it's either asserting something or not, and is thus useless and replaceable by empty.
923
+ // If it has a lower bound > 0, then the contents are still needed, but the loop isn't, since
924
+ // it's non-consuming and thus any more repetitions than 1 are redundant. The one zero-width assertion
925
+ // that can't be handled in this way is a PositiveLookaround, because it might contain capture groups
926
+ // with captures that must persist past the lookaround (in contrast, negative lookarounds undo all
927
+ // captures); if it were to be removed, it could affect both subsequent backreferences as well as access
928
+ // to capture information in the resulting Match. Thus, we can only transform a PositiveLookaround in
929
+ // this manner if it doesn't contain any captures.
930
+ u = u . M == 0 ? new RegexNode ( RegexNodeKind . Empty , Options ) : child ;
921
931
break ;
922
932
}
923
933
}
@@ -2067,7 +2077,7 @@ private RegexNode ReduceLookaround()
2067
2077
// Captures inside of negative lookarounds are undone after the lookaround. Thus, if there's nothing
2068
2078
// inside of the negative lookaround that needs that capture group (namely a backreference), we can
2069
2079
// remove the capture.
2070
- if ( Kind is RegexNodeKind . NegativeLookaround && ContainsBackreference ( Child ( 0 ) ) is false )
2080
+ if ( Kind is RegexNodeKind . NegativeLookaround && ContainsKind ( Child ( 0 ) , [ RegexNodeKind . Backreference , RegexNodeKind . BackreferenceConditional ] ) is false )
2071
2081
{
2072
2082
if ( RemoveCaptures ( this , 0 ) )
2073
2083
{
@@ -2140,26 +2150,32 @@ RegexNodeKind.Beginning or RegexNodeKind.Start or
2140
2150
RegexNodeKind . Bol or RegexNodeKind . Eol or
2141
2151
RegexNodeKind . End or RegexNodeKind . EndZ or
2142
2152
RegexNodeKind . Boundary or RegexNodeKind . ECMABoundary or
2143
- RegexNodeKind . NonBoundary or RegexNodeKind . NonECMABoundary ;
2153
+ RegexNodeKind . NonBoundary or RegexNodeKind . NonECMABoundary or
2154
+ RegexNodeKind . UpdateBumpalong ;
2144
2155
2145
- /// <summary>Gets whether the node contains a backreference anywhere in its tree.</summary>
2146
- private static bool ? ContainsBackreference ( RegexNode node )
2156
+ /// <summary>Gets whether the node contains any of the specified kinds anywhere in its tree.</summary>
2157
+ /// <returns><see langword="true"/> if it does, <see langword="false"/> if it doesn't, and <see langword="null"/> if it can't be determined.</returns>
2158
+ private static bool ? ContainsKind ( RegexNode node , ReadOnlySpan < RegexNodeKind > kinds )
2147
2159
{
2148
- if ( node . Kind is RegexNodeKind . Backreference or RegexNodeKind . BackreferenceConditional )
2160
+ foreach ( RegexNodeKind kind in kinds )
2149
2161
{
2150
- return true ;
2162
+ if ( node . Kind == kind )
2163
+ {
2164
+ return true ;
2165
+ }
2151
2166
}
2152
2167
2153
2168
if ( ! StackHelper . TryEnsureSufficientExecutionStack ( ) )
2154
2169
{
2155
- // If we can't recur further, just stop optimizing.
2170
+ // If we can't recur further, just stop optimizing. We need to return null to signal
2171
+ // that the result can't be trusted.
2156
2172
return null ;
2157
2173
}
2158
2174
2159
2175
int childCount = node . ChildCount ( ) ;
2160
2176
for ( int i = 0 ; i < childCount ; i ++ )
2161
2177
{
2162
- if ( ContainsBackreference ( node . Child ( i ) ) is true )
2178
+ if ( ContainsKind ( node . Child ( i ) , kinds ) is true )
2163
2179
{
2164
2180
return true ;
2165
2181
}
@@ -2796,25 +2812,10 @@ public bool TryGetOrdinalCaseInsensitiveString(int childIndex, int exclusiveChil
2796
2812
// Skip over empty nodes, as they're pure nops. They would ideally have been optimized away,
2797
2813
// but can still remain in some situations.
2798
2814
}
2799
- else if ( consumeZeroWidthNodes &&
2800
- // anchors
2801
- child . Kind is RegexNodeKind . Beginning or
2802
- RegexNodeKind . Bol or
2803
- RegexNodeKind . Start or
2804
- // boundaries
2805
- RegexNodeKind . Boundary or
2806
- RegexNodeKind . ECMABoundary or
2807
- RegexNodeKind . NonBoundary or
2808
- RegexNodeKind . NonECMABoundary or
2809
- // lookarounds
2810
- RegexNodeKind . NegativeLookaround or
2811
- RegexNodeKind . PositiveLookaround or
2812
- // logic
2813
- RegexNodeKind . UpdateBumpalong )
2815
+ else if ( consumeZeroWidthNodes && IsZeroWidthAssertion ( child . Kind ) )
2814
2816
{
2815
- // Skip over zero-width nodes that might be reasonable at the beginning of or within a substring.
2816
- // We can only do these if consumeZeroWidthNodes is true, as otherwise we'd be producing a string that
2817
- // may not fully represent the semantics of this portion of the pattern.
2817
+ // Skip over zero-width nodes. We can only do these if consumeZeroWidthNodes is true, as otherwise we'd
2818
+ // be producing a string that may not fully represent the semantics of this portion of the pattern.
2818
2819
}
2819
2820
else
2820
2821
{
0 commit comments