Skip to content

Commit f5a1a12

Browse files
committed
support case insensitive regexps in the ReDoS queries
1 parent bf15b18 commit f5a1a12

File tree

9 files changed

+233
-20
lines changed

9 files changed

+233
-20
lines changed

config/identical-files.json

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -462,4 +462,4 @@
462462
"javascript/ql/lib/semmle/javascript/security/performance/SuperlinearBackTracking.qll",
463463
"python/ql/lib/semmle/python/security/performance/SuperlinearBackTracking.qll"
464464
]
465-
}
465+
}

javascript/ql/lib/semmle/javascript/security/performance/ReDoSUtil.qll

Lines changed: 72 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -164,6 +164,7 @@ class RelevantRegExpTerm extends RegExpTerm {
164164

165165
/**
166166
* Holds if `term` is the chosen canonical representative for all terms with string representation `str`.
167+
* The string representation includes which flags are used with the regular expression.
167168
*
168169
* Using canonical representatives gives a huge performance boost when working with tuples containing multiple `InputSymbol`s.
169170
* The number of `InputSymbol`s is decreased by 3 orders of magnitude or more in some larger benchmarks.
@@ -173,26 +174,54 @@ private predicate isCanonicalTerm(RelevantRegExpTerm term, string str) {
173174
min(RelevantRegExpTerm t, Location loc, File file |
174175
loc = t.getLocation() and
175176
file = t.getFile() and
176-
str = t.getRawValue()
177+
str = t.getRawValue() + "|" + getCanonicalizationFlags(t.getRootTerm())
177178
|
178179
t order by t.getFile().getRelativePath(), loc.getStartLine(), loc.getStartColumn()
179180
)
180181
}
181182

183+
/**
184+
* Gets a string representation of the flags used with the regular expression.
185+
* Only the flags that are relevant for the canonicalization are included.
186+
*/
187+
string getCanonicalizationFlags(RegExpTerm root) {
188+
root.isRootTerm() and
189+
(
190+
RegExpFlags::isIgnoreCase(root) and
191+
result = "i"
192+
or
193+
not RegExpFlags::isIgnoreCase(root) and
194+
result = ""
195+
)
196+
}
197+
182198
/**
183199
* An abstract input symbol, representing a set of concrete characters.
184200
*/
185201
private newtype TInputSymbol =
186202
/** An input symbol corresponding to character `c`. */
187203
Char(string c) {
188-
c = any(RegexpCharacterConstant cc | cc instanceof RelevantRegExpTerm).getValue().charAt(_)
204+
c =
205+
any(RegexpCharacterConstant cc |
206+
cc instanceof RelevantRegExpTerm and
207+
not RegExpFlags::isIgnoreCase(cc.getRootTerm())
208+
).getValue().charAt(_)
209+
or
210+
// normalize to lower case if the regexp is case insensitive
211+
c =
212+
any(RegexpCharacterConstant cc, string char |
213+
cc instanceof RelevantRegExpTerm and
214+
RegExpFlags::isIgnoreCase(cc.getRootTerm()) and
215+
char = cc.getValue().charAt(_)
216+
|
217+
char.toLowerCase()
218+
)
189219
} or
190220
/**
191221
* An input symbol representing all characters matched by
192222
* a (non-universal) character class that has string representation `charClassString`.
193223
*/
194224
CharClass(string charClassString) {
195-
exists(RelevantRegExpTerm term | term.getRawValue() = charClassString) and
196225
exists(RelevantRegExpTerm recc | isCanonicalTerm(recc, charClassString) |
197226
recc instanceof RegExpCharacterClass and
198227
not recc.(RegExpCharacterClass).isUniversalClass()
@@ -293,6 +322,19 @@ private module CharacterClasses {
293322
*/
294323
pragma[noinline]
295324
predicate hasChildThatMatches(RegExpCharacterClass cc, string char) {
325+
if RegExpFlags::isIgnoreCase(cc.getRootTerm())
326+
then
327+
// normalize everything to lower case if the regexp is case insensitive
328+
exists(string c | hasChildThatMatchesIgnoringCasing(cc, c) | char = c.toLowerCase())
329+
else hasChildThatMatchesIgnoringCasing(cc, char)
330+
}
331+
332+
/**
333+
* Holds if the character class `cc` has a child (constant or range) that matches `char`.
334+
* Ignores whether the character class is inside a regular expression that ignores casing.
335+
*/
336+
pragma[noinline]
337+
predicate hasChildThatMatchesIgnoringCasing(RegExpCharacterClass cc, string char) {
296338
exists(getCanonicalCharClass(cc)) and
297339
exists(RegExpTerm child | child = cc.getAChild() |
298340
char = child.(RegexpCharacterConstant).getValue()
@@ -508,7 +550,7 @@ private State before(RegExpTerm t) { result = Match(t, 0) }
508550
/**
509551
* Gets a state the NFA may be in after matching `t`.
510552
*/
511-
private State after(RegExpTerm t) {
553+
State after(RegExpTerm t) {
512554
exists(RegExpAlt alt | t = alt.getAChild() | result = after(alt))
513555
or
514556
exists(RegExpSequence seq, int i | t = seq.getChild(i) |
@@ -537,7 +579,14 @@ private State after(RegExpTerm t) {
537579
predicate delta(State q1, EdgeLabel lbl, State q2) {
538580
exists(RegexpCharacterConstant s, int i |
539581
q1 = Match(s, i) and
540-
lbl = Char(s.getValue().charAt(i)) and
582+
(
583+
not RegExpFlags::isIgnoreCase(s.getRootTerm()) and
584+
lbl = Char(s.getValue().charAt(i))
585+
or
586+
// normalizing to lower case if ignorecase flag is set
587+
RegExpFlags::isIgnoreCase(s.getRootTerm()) and
588+
exists(string c | c = s.getValue().charAt(i) | lbl = Char(c.toLowerCase()))
589+
) and
541590
(
542591
q2 = Match(s, i + 1)
543592
or
@@ -547,20 +596,20 @@ predicate delta(State q1, EdgeLabel lbl, State q2) {
547596
)
548597
or
549598
exists(RegExpDot dot | q1 = before(dot) and q2 = after(dot) |
550-
if dot.getLiteral().isDotAll() then lbl = Any() else lbl = Dot()
599+
if RegExpFlags::isDotAll(dot.getRootTerm()) then lbl = Any() else lbl = Dot()
551600
)
552601
or
553602
exists(RegExpCharacterClass cc |
554603
cc.isUniversalClass() and q1 = before(cc) and lbl = Any() and q2 = after(cc)
555604
or
556605
q1 = before(cc) and
557-
lbl = CharClass(cc.getRawValue()) and
606+
lbl = CharClass(cc.getRawValue() + "|" + getCanonicalizationFlags(cc.getRootTerm())) and
558607
q2 = after(cc)
559608
)
560609
or
561610
exists(RegExpCharacterClassEscape cc |
562611
q1 = before(cc) and
563-
lbl = CharClass(cc.getRawValue()) and
612+
lbl = CharClass(cc.getRawValue() + "|" + getCanonicalizationFlags(cc.getRootTerm())) and
564613
q2 = after(cc)
565614
)
566615
or
@@ -627,13 +676,27 @@ RegExpRoot getRoot(RegExpTerm term) {
627676
result = getRoot(term.getParent())
628677
}
629678

630-
private newtype TState =
679+
/**
680+
* A state in the NFA.
681+
*/
682+
newtype TState =
683+
/**
684+
* A state representing that the NFA is about to match a term.
685+
* `i` is used to index into multi-char literals.
686+
*/
631687
Match(RelevantRegExpTerm t, int i) {
632688
i = 0
633689
or
634690
exists(t.(RegexpCharacterConstant).getValue().charAt(i))
635691
} or
692+
/**
693+
* An accept state, where exactly the given input string is accepted.
694+
*/
636695
Accept(RegExpRoot l) { l.isRelevant() } or
696+
/**
697+
* An accept state, where the given input string, or any string that has this
698+
* string as a prefix, is accepted.
699+
*/
637700
AcceptAnySuffix(RegExpRoot l) { l.isRelevant() }
638701

639702
/**

javascript/ql/lib/semmle/javascript/security/performance/RegExpTreeView.qll

Lines changed: 38 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -12,3 +12,41 @@ import javascript
1212
* For javascript we make the pragmatic performance optimization to ignore minified files.
1313
*/
1414
predicate isExcluded(RegExpParent parent) { parent.(Expr).getTopLevel().isMinified() }
15+
16+
/**
17+
* A module containing predicates for determining which flags a regular expression has.
18+
*/
19+
module RegExpFlags {
20+
/**
21+
* Holds if `root` has the `i` flag for case-insensitive matching.
22+
*/
23+
predicate isIgnoreCase(RegExpTerm root) {
24+
root.isRootTerm() and
25+
exists(DataFlow::RegExpCreationNode node | node.getRoot() = root |
26+
RegExp::isIgnoreCase(node.getFlags())
27+
)
28+
}
29+
30+
/**
31+
* Gets the flags for `root`, or the empty string if `root` has no flags.
32+
*/
33+
string getFlags(RegExpTerm root) {
34+
root.isRootTerm() and
35+
exists(DataFlow::RegExpCreationNode node | node.getRoot() = root |
36+
result = node.getFlags()
37+
or
38+
not exists(node.getFlags()) and
39+
result = ""
40+
)
41+
}
42+
43+
/**
44+
* Holds if `root` has the `s` flag for dot-all matching (where `.` also matches line terminators).
45+
*/
46+
predicate isDotAll(RegExpTerm root) {
47+
root.isRootTerm() and
48+
exists(DataFlow::RegExpCreationNode node | node.getRoot() = root |
49+
RegExp::isDotAll(node.getFlags())
50+
)
51+
}
52+
}

javascript/ql/test/query-tests/Performance/ReDoS/PolynomialBackTracking.expected

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -502,3 +502,7 @@
502502
| tst.js:375:15:375:16 | x* | Strings with many repetitions of 'x' can start matching anywhere after the start of the preceeding (x*)+(?=$\|y) |
503503
| tst.js:378:16:378:22 | [\\s\\S]* | Strings with many repetitions of 'a' can start matching anywhere after the start of the preceeding ([\\s\\S]*)+(?=$) |
504504
| tst.js:379:16:379:22 | [\\s\\S]* | Strings with many repetitions of 'a' can start matching anywhere after the start of the preceeding ([\\s\\S]*)+(?=$\|y) |
505+
| tst.js:381:15:381:24 | (foo\|FOO)* | Strings with many repetitions of 'FOO' can start matching anywhere after the start of the preceeding (foo\|FOO)*bar |
506+
| tst.js:382:14:382:23 | (foo\|FOO)* | Strings with many repetitions of 'foo' can start matching anywhere after the start of the preceeding (foo\|FOO)*bar |
507+
| tst.js:384:15:384:26 | ([AB]\|[ab])* | Strings with many repetitions of 'A' can start matching anywhere after the start of the preceeding ([AB]\|[ab])*C |
508+
| tst.js:385:14:385:25 | ([DE]\|[de])* | Strings with many repetitions of 'd' can start matching anywhere after the start of the preceeding ([DE]\|[de])*F |

javascript/ql/test/query-tests/Performance/ReDoS/ReDoS.expected

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -178,3 +178,5 @@
178178
| tst.js:375:15:375:16 | x* | This part of the regular expression may cause exponential backtracking on strings containing many repetitions of 'x'. |
179179
| tst.js:378:16:378:22 | [\\s\\S]* | This part of the regular expression may cause exponential backtracking on strings containing many repetitions of 'a'. |
180180
| tst.js:379:16:379:22 | [\\s\\S]* | This part of the regular expression may cause exponential backtracking on strings containing many repetitions of 'a'. |
181+
| tst.js:382:14:382:23 | (foo\|FOO)* | This part of the regular expression may cause exponential backtracking on strings containing many repetitions of 'foo'. |
182+
| tst.js:385:14:385:25 | ([DE]\|[de])* | This part of the regular expression may cause exponential backtracking on strings containing many repetitions of 'd'. |

javascript/ql/test/query-tests/Performance/ReDoS/tst.js

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -376,4 +376,10 @@ var bad90 = /(x*)+(?=$|y)/
376376

377377
// GOOD - but we spuriously conclude that a rejecting suffix exists.
378378
var good44 = /([\s\S]*)+(?=$)/;
379-
var good45 = /([\s\S]*)+(?=$|y)/;
379+
var good45 = /([\s\S]*)+(?=$|y)/;
380+
381+
var good46 = /(foo|FOO)*bar/;
382+
var bad91 = /(foo|FOO)*bar/i;
383+
384+
var good47 = /([AB]|[ab])*C/;
385+
var bad92 = /([DE]|[de])*F/i;

python/ql/lib/semmle/python/RegexTreeView.qll

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -61,6 +61,14 @@ class RegExpLiteral extends TRegExpLiteral, RegExpParent {
6161

6262
predicate isDotAll() { re.getAMode() = "DOTALL" }
6363

64+
predicate isIgnoreCase() { re.getAMode() = "IGNORECASE" }
65+
66+
string getFlags() {
67+
not exists(re.getAMode()) and result = ""
68+
or
69+
result = strictconcat(string mode | mode = re.getAMode() | mode, " | ")
70+
}
71+
6472
override Regex getRegex() { result = re }
6573

6674
string getPrimaryQLClass() { result = "RegExpLiteral" }

0 commit comments

Comments
 (0)