Skip to content

Commit b4963c7

Browse files
authored
Merge pull request github#6558 from erik-krogh/redosCasing
Approved by esbena, yoff
2 parents e4fd749 + 1ad204d commit b4963c7

File tree

9 files changed

+213
-16
lines changed

9 files changed

+213
-16
lines changed

config/identical-files.json

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -462,4 +462,4 @@
462462
"javascript/ql/lib/semmle/javascript/security/performance/SuperlinearBackTracking.qll",
463463
"python/ql/lib/semmle/python/security/performance/SuperlinearBackTracking.qll"
464464
]
465-
}
465+
}

javascript/ql/lib/semmle/javascript/security/performance/ReDoSUtil.qll

Lines changed: 64 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -164,6 +164,7 @@ class RelevantRegExpTerm extends RegExpTerm {
164164

165165
/**
166166
* Holds if `term` is the chosen canonical representative for all terms with string representation `str`.
167+
* The string representation includes which flags are used with the regular expression.
167168
*
168169
* Using canonical representatives gives a huge performance boost when working with tuples containing multiple `InputSymbol`s.
169170
* The number of `InputSymbol`s is decreased by 3 orders of magnitude or more in some larger benchmarks.
@@ -173,26 +174,48 @@ private predicate isCanonicalTerm(RelevantRegExpTerm term, string str) {
173174
min(RelevantRegExpTerm t, Location loc, File file |
174175
loc = t.getLocation() and
175176
file = t.getFile() and
176-
str = t.getRawValue()
177+
str = t.getRawValue() + "|" + getCanonicalizationFlags(t.getRootTerm())
177178
|
178179
t order by t.getFile().getRelativePath(), loc.getStartLine(), loc.getStartColumn()
179180
)
180181
}
181182

183+
/**
184+
* Gets a string reperesentation of the flags used with the regular expression.
185+
* Only the flags that are relevant for the canonicalization are included.
186+
*/
187+
string getCanonicalizationFlags(RegExpTerm root) {
188+
root.isRootTerm() and
189+
(if RegExpFlags::isIgnoreCase(root) then result = "i" else result = "")
190+
}
191+
182192
/**
183193
* An abstract input symbol, representing a set of concrete characters.
184194
*/
185195
private newtype TInputSymbol =
186196
/** An input symbol corresponding to character `c`. */
187197
Char(string c) {
188-
c = any(RegexpCharacterConstant cc | cc instanceof RelevantRegExpTerm).getValue().charAt(_)
198+
c =
199+
any(RegexpCharacterConstant cc |
200+
cc instanceof RelevantRegExpTerm and
201+
not RegExpFlags::isIgnoreCase(cc.getRootTerm())
202+
).getValue().charAt(_)
203+
or
204+
// normalize everything to lower case if the regexp is case insensitive
205+
c =
206+
any(RegexpCharacterConstant cc, string char |
207+
cc instanceof RelevantRegExpTerm and
208+
RegExpFlags::isIgnoreCase(cc.getRootTerm()) and
209+
char = cc.getValue().charAt(_)
210+
|
211+
char.toLowerCase()
212+
)
189213
} or
190214
/**
191215
* An input symbol representing all characters matched by
192216
* a (non-universal) character class that has string representation `charClassString`.
193217
*/
194218
CharClass(string charClassString) {
195-
exists(RelevantRegExpTerm term | term.getRawValue() = charClassString) and
196219
exists(RelevantRegExpTerm recc | isCanonicalTerm(recc, charClassString) |
197220
recc instanceof RegExpCharacterClass and
198221
not recc.(RegExpCharacterClass).isUniversalClass()
@@ -293,6 +316,19 @@ private module CharacterClasses {
293316
*/
294317
pragma[noinline]
295318
predicate hasChildThatMatches(RegExpCharacterClass cc, string char) {
319+
if RegExpFlags::isIgnoreCase(cc.getRootTerm())
320+
then
321+
// normalize everything to lower case if the regexp is case insensitive
322+
exists(string c | hasChildThatMatchesIgnoringCasingFlags(cc, c) | char = c.toLowerCase())
323+
else hasChildThatMatchesIgnoringCasingFlags(cc, char)
324+
}
325+
326+
/**
327+
* Holds if the character class `cc` has a child (constant or range) that matches `char`.
328+
* Ignores whether the character class is inside a regular expression that has the ignore case flag.
329+
*/
330+
pragma[noinline]
331+
predicate hasChildThatMatchesIgnoringCasingFlags(RegExpCharacterClass cc, string char) {
296332
exists(getCanonicalCharClass(cc)) and
297333
exists(RegExpTerm child | child = cc.getAChild() |
298334
char = child.(RegexpCharacterConstant).getValue()
@@ -537,7 +573,14 @@ private State after(RegExpTerm t) {
537573
predicate delta(State q1, EdgeLabel lbl, State q2) {
538574
exists(RegexpCharacterConstant s, int i |
539575
q1 = Match(s, i) and
540-
lbl = Char(s.getValue().charAt(i)) and
576+
(
577+
not RegExpFlags::isIgnoreCase(s.getRootTerm()) and
578+
lbl = Char(s.getValue().charAt(i))
579+
or
580+
// normalize everything to lower case if the regexp is case insensitive
581+
RegExpFlags::isIgnoreCase(s.getRootTerm()) and
582+
exists(string c | c = s.getValue().charAt(i) | lbl = Char(c.toLowerCase()))
583+
) and
541584
(
542585
q2 = Match(s, i + 1)
543586
or
@@ -547,20 +590,20 @@ predicate delta(State q1, EdgeLabel lbl, State q2) {
547590
)
548591
or
549592
exists(RegExpDot dot | q1 = before(dot) and q2 = after(dot) |
550-
if dot.getLiteral().isDotAll() then lbl = Any() else lbl = Dot()
593+
if RegExpFlags::isDotAll(dot.getRootTerm()) then lbl = Any() else lbl = Dot()
551594
)
552595
or
553596
exists(RegExpCharacterClass cc |
554597
cc.isUniversalClass() and q1 = before(cc) and lbl = Any() and q2 = after(cc)
555598
or
556599
q1 = before(cc) and
557-
lbl = CharClass(cc.getRawValue()) and
600+
lbl = CharClass(cc.getRawValue() + "|" + getCanonicalizationFlags(cc.getRootTerm())) and
558601
q2 = after(cc)
559602
)
560603
or
561604
exists(RegExpCharacterClassEscape cc |
562605
q1 = before(cc) and
563-
lbl = CharClass(cc.getRawValue()) and
606+
lbl = CharClass(cc.getRawValue() + "|" + getCanonicalizationFlags(cc.getRootTerm())) and
564607
q2 = after(cc)
565608
)
566609
or
@@ -627,13 +670,27 @@ RegExpRoot getRoot(RegExpTerm term) {
627670
result = getRoot(term.getParent())
628671
}
629672

673+
/**
674+
* A state in the NFA.
675+
*/
630676
private newtype TState =
677+
/**
678+
* A state representing that the NFA is about to match a term.
679+
* `i` is used to index into multi-char literals.
680+
*/
631681
Match(RelevantRegExpTerm t, int i) {
632682
i = 0
633683
or
634684
exists(t.(RegexpCharacterConstant).getValue().charAt(i))
635685
} or
686+
/**
687+
* An accept state, where exactly the given input string is accepted.
688+
*/
636689
Accept(RegExpRoot l) { l.isRelevant() } or
690+
/**
691+
* An accept state, where the given input string, or any string that has this
692+
* string as a prefix, is accepted.
693+
*/
637694
AcceptAnySuffix(RegExpRoot l) { l.isRelevant() }
638695

639696
/**

javascript/ql/lib/semmle/javascript/security/performance/RegExpTreeView.qll

Lines changed: 38 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -12,3 +12,41 @@ import javascript
1212
* For javascript we make the pragmatic performance optimization to ignore minified files.
1313
*/
1414
predicate isExcluded(RegExpParent parent) { parent.(Expr).getTopLevel().isMinified() }
15+
16+
/**
17+
* A module containing predicates for determining which flags a regular expression have.
18+
*/
19+
module RegExpFlags {
20+
/**
21+
* Holds if `root` has the `i` flag for case-insensitive matching.
22+
*/
23+
predicate isIgnoreCase(RegExpTerm root) {
24+
root.isRootTerm() and
25+
exists(DataFlow::RegExpCreationNode node | node.getRoot() = root |
26+
RegExp::isIgnoreCase(node.getFlags())
27+
)
28+
}
29+
30+
/**
31+
* Gets the flags for `root`, or the empty string if `root` has no flags.
32+
*/
33+
string getFlags(RegExpTerm root) {
34+
root.isRootTerm() and
35+
exists(DataFlow::RegExpCreationNode node | node.getRoot() = root |
36+
result = node.getFlags()
37+
or
38+
not exists(node.getFlags()) and
39+
result = ""
40+
)
41+
}
42+
43+
/**
44+
* Holds if `root` has the `s` flag for multi-line matching.
45+
*/
46+
predicate isDotAll(RegExpTerm root) {
47+
root.isRootTerm() and
48+
exists(DataFlow::RegExpCreationNode node | node.getRoot() = root |
49+
RegExp::isDotAll(node.getFlags())
50+
)
51+
}
52+
}

javascript/ql/test/query-tests/Performance/ReDoS/PolynomialBackTracking.expected

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -503,3 +503,7 @@
503503
| tst.js:375:15:375:16 | x* | Strings with many repetitions of 'x' can start matching anywhere after the start of the preceeding (x*)+(?=$\|y) |
504504
| tst.js:378:16:378:22 | [\\s\\S]* | Strings with many repetitions of 'a' can start matching anywhere after the start of the preceeding ([\\s\\S]*)+(?=$) |
505505
| tst.js:379:16:379:22 | [\\s\\S]* | Strings with many repetitions of 'a' can start matching anywhere after the start of the preceeding ([\\s\\S]*)+(?=$\|y) |
506+
| tst.js:381:15:381:24 | (foo\|FOO)* | Strings with many repetitions of 'FOO' can start matching anywhere after the start of the preceeding (foo\|FOO)*bar |
507+
| tst.js:382:14:382:23 | (foo\|FOO)* | Strings with many repetitions of 'foo' can start matching anywhere after the start of the preceeding (foo\|FOO)*bar |
508+
| tst.js:384:15:384:26 | ([AB]\|[ab])* | Strings with many repetitions of 'A' can start matching anywhere after the start of the preceeding ([AB]\|[ab])*C |
509+
| tst.js:385:14:385:25 | ([DE]\|[de])* | Strings with many repetitions of 'd' can start matching anywhere after the start of the preceeding ([DE]\|[de])*F |

javascript/ql/test/query-tests/Performance/ReDoS/ReDoS.expected

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -178,3 +178,5 @@
178178
| tst.js:375:15:375:16 | x* | This part of the regular expression may cause exponential backtracking on strings containing many repetitions of 'x'. |
179179
| tst.js:378:16:378:22 | [\\s\\S]* | This part of the regular expression may cause exponential backtracking on strings containing many repetitions of 'a'. |
180180
| tst.js:379:16:379:22 | [\\s\\S]* | This part of the regular expression may cause exponential backtracking on strings containing many repetitions of 'a'. |
181+
| tst.js:382:14:382:23 | (foo\|FOO)* | This part of the regular expression may cause exponential backtracking on strings containing many repetitions of 'foo'. |
182+
| tst.js:385:14:385:25 | ([DE]\|[de])* | This part of the regular expression may cause exponential backtracking on strings containing many repetitions of 'd'. |

javascript/ql/test/query-tests/Performance/ReDoS/tst.js

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -376,4 +376,10 @@ var bad90 = /(x*)+(?=$|y)/
376376

377377
// GOOD - but we spuriously conclude that a rejecting suffix exists.
378378
var good44 = /([\s\S]*)+(?=$)/;
379-
var good45 = /([\s\S]*)+(?=$|y)/;
379+
var good45 = /([\s\S]*)+(?=$|y)/;
380+
381+
var good46 = /(foo|FOO)*bar/;
382+
var bad91 = /(foo|FOO)*bar/i;
383+
384+
var good47 = /([AB]|[ab])*C/;
385+
var bad92 = /([DE]|[de])*F/i;

python/ql/lib/semmle/python/RegexTreeView.qll

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -61,6 +61,10 @@ class RegExpLiteral extends TRegExpLiteral, RegExpParent {
6161

6262
predicate isDotAll() { re.getAMode() = "DOTALL" }
6363

64+
predicate isIgnoreCase() { re.getAMode() = "IGNORECASE" }
65+
66+
string getFlags() { result = concat(string mode | mode = re.getAMode() | mode, " | ") }
67+
6468
override Regex getRegex() { result = re }
6569

6670
string getPrimaryQLClass() { result = "RegExpLiteral" }

python/ql/lib/semmle/python/security/performance/ReDoSUtil.qll

Lines changed: 64 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -164,6 +164,7 @@ class RelevantRegExpTerm extends RegExpTerm {
164164

165165
/**
166166
* Holds if `term` is the chosen canonical representative for all terms with string representation `str`.
167+
* The string representation includes which flags are used with the regular expression.
167168
*
168169
* Using canonical representatives gives a huge performance boost when working with tuples containing multiple `InputSymbol`s.
169170
* The number of `InputSymbol`s is decreased by 3 orders of magnitude or more in some larger benchmarks.
@@ -173,26 +174,48 @@ private predicate isCanonicalTerm(RelevantRegExpTerm term, string str) {
173174
min(RelevantRegExpTerm t, Location loc, File file |
174175
loc = t.getLocation() and
175176
file = t.getFile() and
176-
str = t.getRawValue()
177+
str = t.getRawValue() + "|" + getCanonicalizationFlags(t.getRootTerm())
177178
|
178179
t order by t.getFile().getRelativePath(), loc.getStartLine(), loc.getStartColumn()
179180
)
180181
}
181182

183+
/**
184+
* Gets a string reperesentation of the flags used with the regular expression.
185+
* Only the flags that are relevant for the canonicalization are included.
186+
*/
187+
string getCanonicalizationFlags(RegExpTerm root) {
188+
root.isRootTerm() and
189+
(if RegExpFlags::isIgnoreCase(root) then result = "i" else result = "")
190+
}
191+
182192
/**
183193
* An abstract input symbol, representing a set of concrete characters.
184194
*/
185195
private newtype TInputSymbol =
186196
/** An input symbol corresponding to character `c`. */
187197
Char(string c) {
188-
c = any(RegexpCharacterConstant cc | cc instanceof RelevantRegExpTerm).getValue().charAt(_)
198+
c =
199+
any(RegexpCharacterConstant cc |
200+
cc instanceof RelevantRegExpTerm and
201+
not RegExpFlags::isIgnoreCase(cc.getRootTerm())
202+
).getValue().charAt(_)
203+
or
204+
// normalize everything to lower case if the regexp is case insensitive
205+
c =
206+
any(RegexpCharacterConstant cc, string char |
207+
cc instanceof RelevantRegExpTerm and
208+
RegExpFlags::isIgnoreCase(cc.getRootTerm()) and
209+
char = cc.getValue().charAt(_)
210+
|
211+
char.toLowerCase()
212+
)
189213
} or
190214
/**
191215
* An input symbol representing all characters matched by
192216
* a (non-universal) character class that has string representation `charClassString`.
193217
*/
194218
CharClass(string charClassString) {
195-
exists(RelevantRegExpTerm term | term.getRawValue() = charClassString) and
196219
exists(RelevantRegExpTerm recc | isCanonicalTerm(recc, charClassString) |
197220
recc instanceof RegExpCharacterClass and
198221
not recc.(RegExpCharacterClass).isUniversalClass()
@@ -293,6 +316,19 @@ private module CharacterClasses {
293316
*/
294317
pragma[noinline]
295318
predicate hasChildThatMatches(RegExpCharacterClass cc, string char) {
319+
if RegExpFlags::isIgnoreCase(cc.getRootTerm())
320+
then
321+
// normalize everything to lower case if the regexp is case insensitive
322+
exists(string c | hasChildThatMatchesIgnoringCasingFlags(cc, c) | char = c.toLowerCase())
323+
else hasChildThatMatchesIgnoringCasingFlags(cc, char)
324+
}
325+
326+
/**
327+
* Holds if the character class `cc` has a child (constant or range) that matches `char`.
328+
* Ignores whether the character class is inside a regular expression that has the ignore case flag.
329+
*/
330+
pragma[noinline]
331+
predicate hasChildThatMatchesIgnoringCasingFlags(RegExpCharacterClass cc, string char) {
296332
exists(getCanonicalCharClass(cc)) and
297333
exists(RegExpTerm child | child = cc.getAChild() |
298334
char = child.(RegexpCharacterConstant).getValue()
@@ -537,7 +573,14 @@ private State after(RegExpTerm t) {
537573
predicate delta(State q1, EdgeLabel lbl, State q2) {
538574
exists(RegexpCharacterConstant s, int i |
539575
q1 = Match(s, i) and
540-
lbl = Char(s.getValue().charAt(i)) and
576+
(
577+
not RegExpFlags::isIgnoreCase(s.getRootTerm()) and
578+
lbl = Char(s.getValue().charAt(i))
579+
or
580+
// normalize everything to lower case if the regexp is case insensitive
581+
RegExpFlags::isIgnoreCase(s.getRootTerm()) and
582+
exists(string c | c = s.getValue().charAt(i) | lbl = Char(c.toLowerCase()))
583+
) and
541584
(
542585
q2 = Match(s, i + 1)
543586
or
@@ -547,20 +590,20 @@ predicate delta(State q1, EdgeLabel lbl, State q2) {
547590
)
548591
or
549592
exists(RegExpDot dot | q1 = before(dot) and q2 = after(dot) |
550-
if dot.getLiteral().isDotAll() then lbl = Any() else lbl = Dot()
593+
if RegExpFlags::isDotAll(dot.getRootTerm()) then lbl = Any() else lbl = Dot()
551594
)
552595
or
553596
exists(RegExpCharacterClass cc |
554597
cc.isUniversalClass() and q1 = before(cc) and lbl = Any() and q2 = after(cc)
555598
or
556599
q1 = before(cc) and
557-
lbl = CharClass(cc.getRawValue()) and
600+
lbl = CharClass(cc.getRawValue() + "|" + getCanonicalizationFlags(cc.getRootTerm())) and
558601
q2 = after(cc)
559602
)
560603
or
561604
exists(RegExpCharacterClassEscape cc |
562605
q1 = before(cc) and
563-
lbl = CharClass(cc.getRawValue()) and
606+
lbl = CharClass(cc.getRawValue() + "|" + getCanonicalizationFlags(cc.getRootTerm())) and
564607
q2 = after(cc)
565608
)
566609
or
@@ -627,13 +670,27 @@ RegExpRoot getRoot(RegExpTerm term) {
627670
result = getRoot(term.getParent())
628671
}
629672

673+
/**
674+
* A state in the NFA.
675+
*/
630676
private newtype TState =
677+
/**
678+
* A state representing that the NFA is about to match a term.
679+
* `i` is used to index into multi-char literals.
680+
*/
631681
Match(RelevantRegExpTerm t, int i) {
632682
i = 0
633683
or
634684
exists(t.(RegexpCharacterConstant).getValue().charAt(i))
635685
} or
686+
/**
687+
* An accept state, where exactly the given input string is accepted.
688+
*/
636689
Accept(RegExpRoot l) { l.isRelevant() } or
690+
/**
691+
* An accept state, where the given input string, or any string that has this
692+
* string as a prefix, is accepted.
693+
*/
637694
AcceptAnySuffix(RegExpRoot l) { l.isRelevant() }
638695

639696
/**

0 commit comments

Comments
 (0)