@@ -140,12 +140,10 @@ class RegExpRoot extends RegExpTerm {
140
140
predicate isRelevant ( ) {
141
141
// there is at least one repetition
142
142
getRoot ( any ( InfiniteRepetitionQuantifier q ) ) = this and
143
- // there are no lookbehinds
144
- not exists ( RegExpLookbehind lbh | getRoot ( lbh ) = this ) and
145
143
// is actually used as a RegExp
146
- this . isUsedAsRegExp ( ) // and
147
- // // pragmatic performance optimization: ignore minified files.
148
- // not getRootTerm().getParent().(Expr).getTopLevel().isMinified( )
144
+ isUsedAsRegExp ( ) and
145
+ // not excluded for library specific reasons
146
+ not isExcluded ( getRootTerm ( ) .getParent ( ) )
149
147
}
150
148
}
151
149
@@ -156,38 +154,68 @@ private class RegexpCharacterConstant extends RegExpConstant {
156
154
RegexpCharacterConstant ( ) { this .isCharacter ( ) }
157
155
}
158
156
157
+ /**
158
+ * A regexp term that is relevant for this ReDoS analysis.
159
+ */
160
+ class RelevantRegExpTerm extends RegExpTerm {
161
+ RelevantRegExpTerm ( ) { getRoot ( this ) .isRelevant ( ) }
162
+ }
163
+
159
164
/**
160
165
* Holds if `term` is the chosen canonical representative for all terms with string representation `str`.
166
+ * The string representation includes which flags are used with the regular expression.
161
167
*
162
168
* Using canonical representatives gives a huge performance boost when working with tuples containing multiple `InputSymbol`s.
163
169
* The number of `InputSymbol`s is decreased by 3 orders of magnitude or more in some larger benchmarks.
164
170
*/
165
- private predicate isCanonicalTerm ( RegExpTerm term , string str ) {
171
+ private predicate isCanonicalTerm ( RelevantRegExpTerm term , string str ) {
166
172
term =
167
- rank [ 1 ] ( RegExpTerm t , Location loc , File file |
173
+ min ( RelevantRegExpTerm t , Location loc , File file |
168
174
loc = t .getLocation ( ) and
169
175
file = t .getFile ( ) and
170
- str = t .getRawValue ( )
176
+ str = t .getRawValue ( ) + "|" + getCanonicalizationFlags ( t . getRootTerm ( ) )
171
177
|
172
178
t order by t .getFile ( ) .getRelativePath ( ) , loc .getStartLine ( ) , loc .getStartColumn ( )
173
179
)
174
180
}
175
181
182
+ /**
183
+ * Gets a string representation of the flags used with the regular expression.
184
+ * Only the flags that are relevant for the canonicalization are included.
185
+ */
186
+ string getCanonicalizationFlags ( RegExpTerm root ) {
187
+ root .isRootTerm ( ) and
188
+ ( if RegExpFlags:: isIgnoreCase ( root ) then result = "i" else result = "" )
189
+ }
190
+
176
191
/**
177
192
* An abstract input symbol, representing a set of concrete characters.
178
193
*/
179
194
private newtype TInputSymbol =
180
195
/** An input symbol corresponding to character `c`. */
181
196
Char ( string c ) {
182
- c = any ( RegexpCharacterConstant cc | getRoot ( cc ) .isRelevant ( ) ) .getValue ( ) .charAt ( _)
197
+ c =
198
+ any ( RegexpCharacterConstant cc |
199
+ cc instanceof RelevantRegExpTerm and
200
+ not RegExpFlags:: isIgnoreCase ( cc .getRootTerm ( ) )
201
+ ) .getValue ( ) .charAt ( _)
202
+ or
203
+ // normalize everything to lower case if the regexp is case insensitive
204
+ c =
205
+ any ( RegexpCharacterConstant cc , string char |
206
+ cc instanceof RelevantRegExpTerm and
207
+ RegExpFlags:: isIgnoreCase ( cc .getRootTerm ( ) ) and
208
+ char = cc .getValue ( ) .charAt ( _)
209
+ |
210
+ char .toLowerCase ( )
211
+ )
183
212
} or
184
213
/**
185
214
* An input symbol representing all characters matched by
186
215
* a (non-universal) character class that has string representation `charClassString`.
187
216
*/
188
217
CharClass ( string charClassString ) {
189
- exists ( RegExpTerm term | term .getRawValue ( ) = charClassString | getRoot ( term ) .isRelevant ( ) ) and
190
- exists ( RegExpTerm recc | isCanonicalTerm ( recc , charClassString ) |
218
+ exists ( RelevantRegExpTerm recc | isCanonicalTerm ( recc , charClassString ) |
191
219
recc instanceof RegExpCharacterClass and
192
220
not recc .( RegExpCharacterClass ) .isUniversalClass ( )
193
221
or
@@ -254,7 +282,7 @@ class InputSymbol extends TInputSymbol {
254
282
/**
255
283
* An abstract input symbol that represents a character class.
256
284
*/
257
- abstract private class CharacterClass extends InputSymbol {
285
+ abstract class CharacterClass extends InputSymbol {
258
286
/**
259
287
* Gets a character that is relevant for intersection-tests involving this
260
288
* character class.
@@ -277,7 +305,7 @@ abstract private class CharacterClass extends InputSymbol {
277
305
/**
278
306
* Gets a character matched by this character class.
279
307
*/
280
- string choose ( ) { result = this . getARelevantChar ( ) and this . matches ( result ) }
308
+ string choose ( ) { result = getARelevantChar ( ) and matches ( result ) }
281
309
}
282
310
283
311
/**
@@ -289,6 +317,19 @@ private module CharacterClasses {
289
317
*/
290
318
pragma [ noinline]
291
319
predicate hasChildThatMatches ( RegExpCharacterClass cc , string char ) {
320
+ if RegExpFlags:: isIgnoreCase ( cc .getRootTerm ( ) )
321
+ then
322
+ // normalize everything to lower case if the regexp is case insensitive
323
+ exists ( string c | hasChildThatMatchesIgnoringCasingFlags ( cc , c ) | char = c .toLowerCase ( ) )
324
+ else hasChildThatMatchesIgnoringCasingFlags ( cc , char )
325
+ }
326
+
327
+ /**
328
+ * Holds if the character class `cc` has a child (constant or range) that matches `char`.
329
+ * Ignores whether the character class is inside a regular expression that has the ignore case flag.
330
+ */
331
+ pragma [ noinline]
332
+ predicate hasChildThatMatchesIgnoringCasingFlags ( RegExpCharacterClass cc , string char ) {
292
333
exists ( getCanonicalCharClass ( cc ) ) and
293
334
exists ( RegExpTerm child | child = cc .getAChild ( ) |
294
335
char = child .( RegexpCharacterConstant ) .getValue ( )
@@ -433,7 +474,7 @@ private module CharacterClasses {
433
474
char = "0123456789" .charAt ( _)
434
475
or
435
476
clazz = "s" and
436
- char = [ " " , "\t" , "\r" , "\n" , 11 .toUnicode ( ) , 12 .toUnicode ( ) ] // 11.toUnicode() = \v, 12.toUnicode() = \f'
477
+ char = [ " " , "\t" , "\r" , "\n" , 11 .toUnicode ( ) , 12 .toUnicode ( ) ] // 11.toUnicode() = \v, 12.toUnicode() = \f
437
478
or
438
479
clazz = "w" and
439
480
char = "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789_" .charAt ( _)
@@ -477,7 +518,7 @@ private module CharacterClasses {
477
518
result = [ "0" , "9" ]
478
519
or
479
520
cc .getValue ( ) = "s" and
480
- result = [ " " ]
521
+ result = " "
481
522
or
482
523
cc .getValue ( ) = "w" and
483
524
result = [ "a" , "Z" , "_" , "0" , "9" ]
@@ -490,7 +531,7 @@ private module CharacterClasses {
490
531
result = "9"
491
532
or
492
533
cc .getValue ( ) = "s" and
493
- result = [ " " ]
534
+ result = " "
494
535
or
495
536
cc .getValue ( ) = "w" and
496
537
result = "a"
@@ -604,7 +645,7 @@ private State before(RegExpTerm t) { result = Match(t, 0) }
604
645
/**
605
646
* Gets a state the NFA may be in after matching `t`.
606
647
*/
607
- private State after ( RegExpTerm t ) {
648
+ State after ( RegExpTerm t ) {
608
649
exists ( RegExpAlt alt | t = alt .getAChild ( ) | result = after ( alt ) )
609
650
or
610
651
exists ( RegExpSequence seq , int i | t = seq .getChild ( i ) |
@@ -633,7 +674,14 @@ private State after(RegExpTerm t) {
633
674
predicate delta ( State q1 , EdgeLabel lbl , State q2 ) {
634
675
exists ( RegexpCharacterConstant s , int i |
635
676
q1 = Match ( s , i ) and
636
- lbl = Char ( s .getValue ( ) .charAt ( i ) ) and
677
+ (
678
+ not RegExpFlags:: isIgnoreCase ( s .getRootTerm ( ) ) and
679
+ lbl = Char ( s .getValue ( ) .charAt ( i ) )
680
+ or
681
+ // normalize everything to lower case if the regexp is case insensitive
682
+ RegExpFlags:: isIgnoreCase ( s .getRootTerm ( ) ) and
683
+ exists ( string c | c = s .getValue ( ) .charAt ( i ) | lbl = Char ( c .toLowerCase ( ) ) )
684
+ ) and
637
685
(
638
686
q2 = Match ( s , i + 1 )
639
687
or
@@ -643,20 +691,20 @@ predicate delta(State q1, EdgeLabel lbl, State q2) {
643
691
)
644
692
or
645
693
exists ( RegExpDot dot | q1 = before ( dot ) and q2 = after ( dot ) |
646
- if dot .getLiteral ( ) . isDotAll ( ) then lbl = Any ( ) else lbl = Dot ( )
694
+ if RegExpFlags :: isDotAll ( dot .getRootTerm ( ) ) then lbl = Any ( ) else lbl = Dot ( )
647
695
)
648
696
or
649
697
exists ( RegExpCharacterClass cc |
650
698
cc .isUniversalClass ( ) and q1 = before ( cc ) and lbl = Any ( ) and q2 = after ( cc )
651
699
or
652
700
q1 = before ( cc ) and
653
- lbl = CharClass ( cc .getRawValue ( ) ) and
701
+ lbl = CharClass ( cc .getRawValue ( ) + "|" + getCanonicalizationFlags ( cc . getRootTerm ( ) ) ) and
654
702
q2 = after ( cc )
655
703
)
656
704
or
657
705
exists ( RegExpCharacterClassEscape cc |
658
706
q1 = before ( cc ) and
659
- lbl = CharClass ( cc .getRawValue ( ) ) and
707
+ lbl = CharClass ( cc .getRawValue ( ) + "|" + getCanonicalizationFlags ( cc . getRootTerm ( ) ) ) and
660
708
q2 = after ( cc )
661
709
)
662
710
or
@@ -729,16 +777,27 @@ RegExpRoot getRoot(RegExpTerm term) {
729
777
result = getRoot ( term .getParent ( ) )
730
778
}
731
779
732
- private newtype TState =
733
- Match ( RegExpTerm t , int i ) {
734
- getRoot ( t ) .isRelevant ( ) and
735
- (
736
- i = 0
737
- or
738
- exists ( t .( RegexpCharacterConstant ) .getValue ( ) .charAt ( i ) )
739
- )
780
+ /**
781
+ * A state in the NFA.
782
+ */
783
+ newtype TState =
784
+ /**
785
+ * A state representing that the NFA is about to match a term.
786
+ * `i` is used to index into multi-char literals.
787
+ */
788
+ Match ( RelevantRegExpTerm t , int i ) {
789
+ i = 0
790
+ or
791
+ exists ( t .( RegexpCharacterConstant ) .getValue ( ) .charAt ( i ) )
740
792
} or
793
+ /**
794
+ * An accept state, where exactly the given input string is accepted.
795
+ */
741
796
Accept ( RegExpRoot l ) { l .isRelevant ( ) } or
797
+ /**
798
+ * An accept state, where the given input string, or any string that has this
799
+ * string as a prefix, is accepted.
800
+ */
742
801
AcceptAnySuffix ( RegExpRoot l ) { l .isRelevant ( ) }
743
802
744
803
/**
@@ -851,29 +910,26 @@ InputSymbol getAnInputSymbolMatching(string char) {
851
910
result = Any ( )
852
911
}
853
912
913
+ /**
914
+ * Holds if `state` is a start state.
915
+ */
916
+ predicate isStartState ( State state ) {
917
+ state = mkMatch ( any ( RegExpRoot r ) )
918
+ or
919
+ exists ( RegExpCaret car | state = after ( car ) )
920
+ }
921
+
854
922
/**
855
923
* Predicates for constructing a prefix string that leads to a given state.
856
924
*/
857
925
private module PrefixConstruction {
858
- /**
859
- * Holds if `state` starts the string matched by the regular expression.
860
- */
861
- private predicate isStartState ( State state ) {
862
- state instanceof StateInPumpableRegexp and
863
- (
864
- state = Match ( any ( RegExpRoot r ) , _)
865
- or
866
- exists ( RegExpCaret car | state = after ( car ) )
867
- )
868
- }
869
-
870
926
/**
871
927
* Holds if `state` is the textually last start state for the regular expression.
872
928
*/
873
929
private predicate lastStartState ( State state ) {
874
930
exists ( RegExpRoot root |
875
931
state =
876
- max ( State s , Location l |
932
+ max ( StateInPumpableRegexp s , Location l |
877
933
isStartState ( s ) and getRoot ( s .getRepr ( ) ) = root and l = s .getRepr ( ) .getLocation ( )
878
934
|
879
935
s
@@ -1173,7 +1229,6 @@ private predicate isReDoSAttackable(RegExpTerm term, string pump, State s) {
1173
1229
* `prefixMsg` contains a friendly message for a prefix that reaches `s` (or `prefixMsg` is the empty string if the prefix is empty or if no prefix could be found).
1174
1230
*/
1175
1231
predicate hasReDoSResult ( RegExpTerm t , string pump , State s , string prefixMsg ) {
1176
- not t .getRegExp ( ) .hasFreeSpacingFlag ( ) and // exclude free-spacing mode regexes
1177
1232
isReDoSAttackable ( t , pump , s ) and
1178
1233
(
1179
1234
prefixMsg = "starting with '" + escape ( PrefixConstruction:: prefix ( s ) ) + "' and " and
0 commit comments