Skip to content

Commit c15ddf6

Browse files
committed
update ReDoSUtil in ruby
1 parent 2ddf445 commit c15ddf6

File tree

2 files changed

+138
-43
lines changed

2 files changed

+138
-43
lines changed

ruby/ql/lib/codeql/ruby/security/performance/ReDoSUtil.qll

Lines changed: 98 additions & 43 deletions
Original file line numberDiff line numberDiff line change
@@ -140,12 +140,10 @@ class RegExpRoot extends RegExpTerm {
140140
predicate isRelevant() {
141141
// there is at least one repetition
142142
getRoot(any(InfiniteRepetitionQuantifier q)) = this and
143-
// there are no lookbehinds
144-
not exists(RegExpLookbehind lbh | getRoot(lbh) = this) and
145143
// is actually used as a RegExp
146-
this.isUsedAsRegExp() //and
147-
// // pragmatic performance optimization: ignore minified files.
148-
// not getRootTerm().getParent().(Expr).getTopLevel().isMinified()
144+
isUsedAsRegExp() and
145+
// not excluded for library specific reasons
146+
not isExcluded(getRootTerm().getParent())
149147
}
150148
}
151149

@@ -156,38 +154,68 @@ private class RegexpCharacterConstant extends RegExpConstant {
156154
RegexpCharacterConstant() { this.isCharacter() }
157155
}
158156

157+
/**
158+
* A regexp term that is relevant for this ReDoS analysis.
159+
*/
160+
class RelevantRegExpTerm extends RegExpTerm {
161+
RelevantRegExpTerm() { getRoot(this).isRelevant() }
162+
}
163+
159164
/**
160165
* Holds if `term` is the chosen canonical representative for all terms with string representation `str`.
166+
* The string representation includes which flags are used with the regular expression.
161167
*
162168
* Using canonical representatives gives a huge performance boost when working with tuples containing multiple `InputSymbol`s.
163169
* The number of `InputSymbol`s is decreased by 3 orders of magnitude or more in some larger benchmarks.
164170
*/
165-
private predicate isCanonicalTerm(RegExpTerm term, string str) {
171+
private predicate isCanonicalTerm(RelevantRegExpTerm term, string str) {
166172
term =
167-
rank[1](RegExpTerm t, Location loc, File file |
173+
min(RelevantRegExpTerm t, Location loc, File file |
168174
loc = t.getLocation() and
169175
file = t.getFile() and
170-
str = t.getRawValue()
176+
str = t.getRawValue() + "|" + getCanonicalizationFlags(t.getRootTerm())
171177
|
172178
t order by t.getFile().getRelativePath(), loc.getStartLine(), loc.getStartColumn()
173179
)
174180
}
175181

182+
/**
183+
* Gets a string representation of the flags used with the regular expression.
184+
* Only the flags that are relevant for the canonicalization are included.
185+
*/
186+
string getCanonicalizationFlags(RegExpTerm root) {
187+
root.isRootTerm() and
188+
(if RegExpFlags::isIgnoreCase(root) then result = "i" else result = "")
189+
}
190+
176191
/**
177192
* An abstract input symbol, representing a set of concrete characters.
178193
*/
179194
private newtype TInputSymbol =
180195
/** An input symbol corresponding to character `c`. */
181196
Char(string c) {
182-
c = any(RegexpCharacterConstant cc | getRoot(cc).isRelevant()).getValue().charAt(_)
197+
c =
198+
any(RegexpCharacterConstant cc |
199+
cc instanceof RelevantRegExpTerm and
200+
not RegExpFlags::isIgnoreCase(cc.getRootTerm())
201+
).getValue().charAt(_)
202+
or
203+
// normalize everything to lower case if the regexp is case insensitive
204+
c =
205+
any(RegexpCharacterConstant cc, string char |
206+
cc instanceof RelevantRegExpTerm and
207+
RegExpFlags::isIgnoreCase(cc.getRootTerm()) and
208+
char = cc.getValue().charAt(_)
209+
|
210+
char.toLowerCase()
211+
)
183212
} or
184213
/**
185214
* An input symbol representing all characters matched by
186215
* a (non-universal) character class that has string representation `charClassString`.
187216
*/
188217
CharClass(string charClassString) {
189-
exists(RegExpTerm term | term.getRawValue() = charClassString | getRoot(term).isRelevant()) and
190-
exists(RegExpTerm recc | isCanonicalTerm(recc, charClassString) |
218+
exists(RelevantRegExpTerm recc | isCanonicalTerm(recc, charClassString) |
191219
recc instanceof RegExpCharacterClass and
192220
not recc.(RegExpCharacterClass).isUniversalClass()
193221
or
@@ -254,7 +282,7 @@ class InputSymbol extends TInputSymbol {
254282
/**
255283
* An abstract input symbol that represents a character class.
256284
*/
257-
abstract private class CharacterClass extends InputSymbol {
285+
abstract class CharacterClass extends InputSymbol {
258286
/**
259287
* Gets a character that is relevant for intersection-tests involving this
260288
* character class.
@@ -277,7 +305,7 @@ abstract private class CharacterClass extends InputSymbol {
277305
/**
278306
* Gets a character matched by this character class.
279307
*/
280-
string choose() { result = this.getARelevantChar() and this.matches(result) }
308+
string choose() { result = getARelevantChar() and matches(result) }
281309
}
282310

283311
/**
@@ -289,6 +317,19 @@ private module CharacterClasses {
289317
*/
290318
pragma[noinline]
291319
predicate hasChildThatMatches(RegExpCharacterClass cc, string char) {
320+
if RegExpFlags::isIgnoreCase(cc.getRootTerm())
321+
then
322+
// normalize everything to lower case if the regexp is case insensitive
323+
exists(string c | hasChildThatMatchesIgnoringCasingFlags(cc, c) | char = c.toLowerCase())
324+
else hasChildThatMatchesIgnoringCasingFlags(cc, char)
325+
}
326+
327+
/**
328+
* Holds if the character class `cc` has a child (constant or range) that matches `char`.
329+
* Ignores whether the character class is inside a regular expression that has the ignore case flag.
330+
*/
331+
pragma[noinline]
332+
predicate hasChildThatMatchesIgnoringCasingFlags(RegExpCharacterClass cc, string char) {
292333
exists(getCanonicalCharClass(cc)) and
293334
exists(RegExpTerm child | child = cc.getAChild() |
294335
char = child.(RegexpCharacterConstant).getValue()
@@ -433,7 +474,7 @@ private module CharacterClasses {
433474
char = "0123456789".charAt(_)
434475
or
435476
clazz = "s" and
436-
char = [" ", "\t", "\r", "\n", 11.toUnicode(), 12.toUnicode()] // 11.toUnicode() = \v, 12.toUnicode() = \f'
477+
char = [" ", "\t", "\r", "\n", 11.toUnicode(), 12.toUnicode()] // 11.toUnicode() = \v, 12.toUnicode() = \f
437478
or
438479
clazz = "w" and
439480
char = "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789_".charAt(_)
@@ -477,7 +518,7 @@ private module CharacterClasses {
477518
result = ["0", "9"]
478519
or
479520
cc.getValue() = "s" and
480-
result = [" "]
521+
result = " "
481522
or
482523
cc.getValue() = "w" and
483524
result = ["a", "Z", "_", "0", "9"]
@@ -490,7 +531,7 @@ private module CharacterClasses {
490531
result = "9"
491532
or
492533
cc.getValue() = "s" and
493-
result = [" "]
534+
result = " "
494535
or
495536
cc.getValue() = "w" and
496537
result = "a"
@@ -604,7 +645,7 @@ private State before(RegExpTerm t) { result = Match(t, 0) }
604645
/**
605646
* Gets a state the NFA may be in after matching `t`.
606647
*/
607-
private State after(RegExpTerm t) {
648+
State after(RegExpTerm t) {
608649
exists(RegExpAlt alt | t = alt.getAChild() | result = after(alt))
609650
or
610651
exists(RegExpSequence seq, int i | t = seq.getChild(i) |
@@ -633,7 +674,14 @@ private State after(RegExpTerm t) {
633674
predicate delta(State q1, EdgeLabel lbl, State q2) {
634675
exists(RegexpCharacterConstant s, int i |
635676
q1 = Match(s, i) and
636-
lbl = Char(s.getValue().charAt(i)) and
677+
(
678+
not RegExpFlags::isIgnoreCase(s.getRootTerm()) and
679+
lbl = Char(s.getValue().charAt(i))
680+
or
681+
// normalize everything to lower case if the regexp is case insensitive
682+
RegExpFlags::isIgnoreCase(s.getRootTerm()) and
683+
exists(string c | c = s.getValue().charAt(i) | lbl = Char(c.toLowerCase()))
684+
) and
637685
(
638686
q2 = Match(s, i + 1)
639687
or
@@ -643,20 +691,20 @@ predicate delta(State q1, EdgeLabel lbl, State q2) {
643691
)
644692
or
645693
exists(RegExpDot dot | q1 = before(dot) and q2 = after(dot) |
646-
if dot.getLiteral().isDotAll() then lbl = Any() else lbl = Dot()
694+
if RegExpFlags::isDotAll(dot.getRootTerm()) then lbl = Any() else lbl = Dot()
647695
)
648696
or
649697
exists(RegExpCharacterClass cc |
650698
cc.isUniversalClass() and q1 = before(cc) and lbl = Any() and q2 = after(cc)
651699
or
652700
q1 = before(cc) and
653-
lbl = CharClass(cc.getRawValue()) and
701+
lbl = CharClass(cc.getRawValue() + "|" + getCanonicalizationFlags(cc.getRootTerm())) and
654702
q2 = after(cc)
655703
)
656704
or
657705
exists(RegExpCharacterClassEscape cc |
658706
q1 = before(cc) and
659-
lbl = CharClass(cc.getRawValue()) and
707+
lbl = CharClass(cc.getRawValue() + "|" + getCanonicalizationFlags(cc.getRootTerm())) and
660708
q2 = after(cc)
661709
)
662710
or
@@ -729,16 +777,27 @@ RegExpRoot getRoot(RegExpTerm term) {
729777
result = getRoot(term.getParent())
730778
}
731779

732-
private newtype TState =
733-
Match(RegExpTerm t, int i) {
734-
getRoot(t).isRelevant() and
735-
(
736-
i = 0
737-
or
738-
exists(t.(RegexpCharacterConstant).getValue().charAt(i))
739-
)
780+
/**
781+
* A state in the NFA.
782+
*/
783+
newtype TState =
784+
/**
785+
* A state representing that the NFA is about to match a term.
786+
* `i` is used to index into multi-char literals.
787+
*/
788+
Match(RelevantRegExpTerm t, int i) {
789+
i = 0
790+
or
791+
exists(t.(RegexpCharacterConstant).getValue().charAt(i))
740792
} or
793+
/**
794+
* An accept state, where exactly the given input string is accepted.
795+
*/
741796
Accept(RegExpRoot l) { l.isRelevant() } or
797+
/**
798+
* An accept state, where the given input string, or any string that has this
799+
* string as a prefix, is accepted.
800+
*/
742801
AcceptAnySuffix(RegExpRoot l) { l.isRelevant() }
743802

744803
/**
@@ -851,29 +910,26 @@ InputSymbol getAnInputSymbolMatching(string char) {
851910
result = Any()
852911
}
853912

913+
/**
914+
* Holds if `state` is a start state.
915+
*/
916+
predicate isStartState(State state) {
917+
state = mkMatch(any(RegExpRoot r))
918+
or
919+
exists(RegExpCaret car | state = after(car))
920+
}
921+
854922
/**
855923
* Predicates for constructing a prefix string that leads to a given state.
856924
*/
857925
private module PrefixConstruction {
858-
/**
859-
* Holds if `state` starts the string matched by the regular expression.
860-
*/
861-
private predicate isStartState(State state) {
862-
state instanceof StateInPumpableRegexp and
863-
(
864-
state = Match(any(RegExpRoot r), _)
865-
or
866-
exists(RegExpCaret car | state = after(car))
867-
)
868-
}
869-
870926
/**
871927
* Holds if `state` is the textually last start state for the regular expression.
872928
*/
873929
private predicate lastStartState(State state) {
874930
exists(RegExpRoot root |
875931
state =
876-
max(State s, Location l |
932+
max(StateInPumpableRegexp s, Location l |
877933
isStartState(s) and getRoot(s.getRepr()) = root and l = s.getRepr().getLocation()
878934
|
879935
s
@@ -1173,7 +1229,6 @@ private predicate isReDoSAttackable(RegExpTerm term, string pump, State s) {
11731229
* `prefixMsg` contains a friendly message for a prefix that reaches `s` (or `prefixMsg` is the empty string if the prefix is empty or if no prefix could be found).
11741230
*/
11751231
predicate hasReDoSResult(RegExpTerm t, string pump, State s, string prefixMsg) {
1176-
not t.getRegExp().hasFreeSpacingFlag() and // exclude free-spacing mode regexes
11771232
isReDoSAttackable(t, pump, s) and
11781233
(
11791234
prefixMsg = "starting with '" + escape(PrefixConstruction::prefix(s)) + "' and " and

ruby/ql/lib/codeql/ruby/security/performance/RegExpTreeView.qll

Lines changed: 40 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,42 @@ private import codeql.ruby.ast.Literal as AST
22
private import codeql.Locations
33
private import ParseRegExp
44

5+
/**
6+
* Holds if the regular expression should not be considered.
7+
*/
8+
predicate isExcluded(RegExpParent parent) {
9+
parent.(RegExpTerm).getRegExp().hasFreeSpacingFlag() // exclude free-spacing mode regexes
10+
}
11+
12+
/**
13+
* A module containing predicates for determining which flags a regular expression has.
14+
*/
15+
module RegExpFlags {
16+
/**
17+
* Holds if `root` has the `i` flag for case-insensitive matching.
18+
*/
19+
predicate isIgnoreCase(RegExpTerm root) {
20+
root.isRootTerm() and
21+
root.getLiteral().isIgnoreCase()
22+
}
23+
24+
/**
25+
* Gets the flags for `root`, or the empty string if `root` has no flags.
26+
*/
27+
string getFlags(RegExpTerm root) {
28+
root.isRootTerm() and
29+
result = root.getLiteral().getFlags()
30+
}
31+
32+
/**
33+
* Holds if `root` has the multiline flag (`m` in Ruby), which makes `.` also match newline characters.
34+
*/
35+
predicate isDotAll(RegExpTerm root) {
36+
root.isRootTerm() and
37+
root.getLiteral().isDotAll()
38+
}
39+
}
40+
541
/**
642
* An element containing a regular expression term, that is, either
743
* a string literal (parsed as a regular expression)
@@ -38,6 +74,10 @@ class RegExpLiteral extends TRegExpLiteral, RegExpParent {
3874

3975
predicate isDotAll() { re.hasMultilineFlag() }
4076

77+
predicate isIgnoreCase() { re.hasCaseInsensitiveFlag() }
78+
79+
string getFlags() { result = re.getFlagString() }
80+
4181
override string getAPrimaryQlClass() { result = "RegExpLiteral" }
4282
}
4383

0 commit comments

Comments
 (0)