Skip to content

Commit dd525a4

Browse files
authored
Merge pull request github#11061 from erik-krogh/shared-redosMod
ReDoS: add a shared regex pack
2 parents d19bde8 + f5daee2 commit dd525a4

11 files changed

+3294
-34
lines changed

config/identical-files.json

Lines changed: 0 additions & 34 deletions
Original file line numberDiff line numberDiff line change
@@ -486,40 +486,6 @@
486486
"python/ql/lib/semmle/python/security/internal/SensitiveDataHeuristics.qll",
487487
"ruby/ql/lib/codeql/ruby/security/internal/SensitiveDataHeuristics.qll"
488488
],
489-
"ReDoS Util Python/JS/Ruby/Java": [
490-
"javascript/ql/lib/semmle/javascript/security/regexp/NfaUtils.qll",
491-
"python/ql/lib/semmle/python/security/regexp/NfaUtils.qll",
492-
"ruby/ql/lib/codeql/ruby/security/regexp/NfaUtils.qll",
493-
"java/ql/lib/semmle/code/java/security/regexp/NfaUtils.qll"
494-
],
495-
"ReDoS Exponential Python/JS/Ruby/Java": [
496-
"javascript/ql/lib/semmle/javascript/security/regexp/ExponentialBackTracking.qll",
497-
"python/ql/lib/semmle/python/security/regexp/ExponentialBackTracking.qll",
498-
"ruby/ql/lib/codeql/ruby/security/regexp/ExponentialBackTracking.qll",
499-
"java/ql/lib/semmle/code/java/security/regexp/ExponentialBackTracking.qll"
500-
],
501-
"ReDoS Polynomial Python/JS/Ruby/Java": [
502-
"javascript/ql/lib/semmle/javascript/security/regexp/SuperlinearBackTracking.qll",
503-
"python/ql/lib/semmle/python/security/regexp/SuperlinearBackTracking.qll",
504-
"ruby/ql/lib/codeql/ruby/security/regexp/SuperlinearBackTracking.qll",
505-
"java/ql/lib/semmle/code/java/security/regexp/SuperlinearBackTracking.qll"
506-
],
507-
"RegexpMatching Python/JS/Ruby": [
508-
"javascript/ql/lib/semmle/javascript/security/regexp/RegexpMatching.qll",
509-
"python/ql/lib/semmle/python/security/regexp/RegexpMatching.qll",
510-
"ruby/ql/lib/codeql/ruby/security/regexp/RegexpMatching.qll"
511-
],
512-
"BadTagFilterQuery Python/JS/Ruby": [
513-
"javascript/ql/lib/semmle/javascript/security/BadTagFilterQuery.qll",
514-
"python/ql/lib/semmle/python/security/BadTagFilterQuery.qll",
515-
"ruby/ql/lib/codeql/ruby/security/BadTagFilterQuery.qll"
516-
],
517-
"OverlyLargeRange Python/JS/Ruby/Java": [
518-
"javascript/ql/lib/semmle/javascript/security/OverlyLargeRangeQuery.qll",
519-
"python/ql/lib/semmle/python/security/OverlyLargeRangeQuery.qll",
520-
"ruby/ql/lib/codeql/ruby/security/OverlyLargeRangeQuery.qll",
521-
"java/ql/lib/semmle/code/java/security/OverlyLargeRangeQuery.qll"
522-
],
523489
"CFG": [
524490
"csharp/ql/lib/semmle/code/csharp/controlflow/internal/ControlFlowGraphImplShared.qll",
525491
"ruby/ql/lib/codeql/ruby/controlflow/internal/ControlFlowGraphImplShared.qll",
Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,4 @@
1+
---
2+
category: minorAnalysis
3+
---
4+
* Initial release. Extracted common regex related code, including the ReDoS analysis, into a library pack to share code between languages.

shared/regex/codeql-pack.lock.yml

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,4 @@
1+
---
2+
dependencies: {}
3+
compiled: false
4+
lockVersion: 1.0.0
Lines changed: 300 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,300 @@
1+
/**
2+
* Classes and predicates for working with suspicious character ranges.
3+
*/
4+
5+
private import RegexTreeView
6+
7+
/**
8+
* Classes and predicates implementing an analysis detecting suspicious character ranges.
9+
*/
10+
module Make<RegexTreeViewSig TreeImpl> {
11+
private import TreeImpl
12+
13+
/**
14+
* Gets a rank for `range` that is unique for ranges in the same file.
15+
* Prioritizes ranges that match more characters.
16+
*/
17+
int rankRange(RegExpCharacterRange range) {
18+
range =
19+
rank[result](RegExpCharacterRange r, int startline, int startcolumn, int low, int high |
20+
r.hasLocationInfo(_, startline, startcolumn, _, _) and
21+
isRange(r, low, high)
22+
|
23+
r order by (high - low) desc, startline, startcolumn
24+
)
25+
}
26+
27+
/** Holds if `range` spans from the unicode code points `low` to `high` (both inclusive). */
28+
predicate isRange(RegExpCharacterRange range, int low, int high) {
29+
exists(string lowc, string highc |
30+
range.isRange(lowc, highc) and
31+
low.toUnicode() = lowc and
32+
high.toUnicode() = highc
33+
)
34+
}
35+
36+
/** Holds if `char` is an alpha-numeric character. */
37+
predicate isAlphanumeric(string char) {
38+
// written like this to avoid having a bindingset for the predicate
39+
char = [[48 .. 57], [65 .. 90], [97 .. 122]].toUnicode() // 0-9, A-Z, a-z
40+
}
41+
42+
/**
43+
* Holds if the given ranges are from the same character class
44+
* and there exists at least one character matched by both ranges.
45+
*/
46+
predicate overlap(RegExpCharacterRange a, RegExpCharacterRange b) {
47+
exists(RegExpCharacterClass clz |
48+
a = clz.getAChild() and
49+
b = clz.getAChild() and
50+
a != b
51+
|
52+
exists(int alow, int ahigh, int blow, int bhigh |
53+
isRange(a, alow, ahigh) and
54+
isRange(b, blow, bhigh) and
55+
alow <= bhigh and
56+
blow <= ahigh
57+
)
58+
)
59+
}
60+
61+
/**
62+
* Holds if `range` overlaps with the char class `escape` from the same character class.
63+
*/
64+
predicate overlapsWithCharEscape(RegExpCharacterRange range, RegExpCharacterClassEscape escape) {
65+
exists(RegExpCharacterClass clz, string low, string high |
66+
range = clz.getAChild() and
67+
escape = clz.getAChild() and
68+
range.isRange(low, high)
69+
|
70+
escape.getValue() = "w" and
71+
getInRange(low, high).regexpMatch("\\w")
72+
or
73+
escape.getValue() = "d" and
74+
getInRange(low, high).regexpMatch("\\d")
75+
or
76+
escape.getValue() = "s" and
77+
getInRange(low, high).regexpMatch("\\s")
78+
)
79+
}
80+
81+
/** Gets the unicode code point for a `char`. */
82+
bindingset[char]
83+
int toCodePoint(string char) { result.toUnicode() = char }
84+
85+
/** A character range that appears to be overly wide. */
86+
class OverlyWideRange instanceof RegExpCharacterRange {
87+
OverlyWideRange() {
88+
exists(int low, int high, int numChars |
89+
isRange(this, low, high) and
90+
numChars = (1 + high - low) and
91+
this.getRootTerm().isUsedAsRegExp() and
92+
numChars >= 10
93+
|
94+
// across the Z-a range (which includes backticks)
95+
toCodePoint("Z") >= low and
96+
toCodePoint("a") <= high
97+
or
98+
// across the 9-A range (which includes e.g. ; and ?)
99+
toCodePoint("9") >= low and
100+
toCodePoint("A") <= high
101+
or
102+
// a non-alphanumeric char as part of the range boundaries
103+
exists(int bound | bound = [low, high] | not isAlphanumeric(bound.toUnicode())) and
104+
// while still being ascii
105+
low < 128 and
106+
high < 128
107+
) and
108+
// allowlist for known ranges
109+
not this = allowedWideRanges()
110+
}
111+
112+
/** Gets a string representation of a character class that matches the same chars as this range. */
113+
string printEquivalent() { result = RangePrinter::printEquivalentCharClass(this) }
114+
115+
/** Gets a string representation of this range. */
116+
string toString() { result = super.toString() }
117+
118+
/** Holds if `lo` is the lower bound of this character range and `hi` the upper bound. */
119+
predicate isRange(string lo, string hi) { super.isRange(lo, hi) }
120+
}
121+
122+
/** Gets a range that should not be reported as an overly wide range. */
123+
RegExpCharacterRange allowedWideRanges() {
124+
// ~ is the last printable ASCII character, it's used right in various wide ranges.
125+
result.isRange(_, "~")
126+
or
127+
// the same with " " and "!". " " is the first printable character, and "!" is the first non-white-space printable character.
128+
result.isRange([" ", "!"], _)
129+
or
130+
// the `[@-_]` range is intentional
131+
result.isRange("@", "_")
132+
or
133+
// starting from the zero byte is a good indication that it's purposely matching a large range.
134+
result.isRange(0.toUnicode(), _)
135+
}
136+
137+
/** Gets a char between (and including) `low` and `high`. */
138+
bindingset[low, high]
139+
private string getInRange(string low, string high) {
140+
result = [toCodePoint(low) .. toCodePoint(high)].toUnicode()
141+
}
142+
143+
/** A module computing an equivalent character class for an overly wide range. */
144+
module RangePrinter {
145+
bindingset[char]
146+
bindingset[result]
147+
private string next(string char) {
148+
exists(int prev, int next |
149+
prev.toUnicode() = char and
150+
next.toUnicode() = result and
151+
next = prev + 1
152+
)
153+
}
154+
155+
/** Gets the points where the parts of the pretty printed range should be cut off. */
156+
private string cutoffs() { result = ["A", "Z", "a", "z", "0", "9"] }
157+
158+
/** Gets the char to use in the low end of a range for a given `cut` */
159+
private string lowCut(string cut) {
160+
cut = ["A", "a", "0"] and
161+
result = cut
162+
or
163+
cut = ["Z", "z", "9"] and
164+
result = next(cut)
165+
}
166+
167+
/** Gets the char to use in the high end of a range for a given `cut` */
168+
private string highCut(string cut) {
169+
cut = ["Z", "z", "9"] and
170+
result = cut
171+
or
172+
cut = ["A", "a", "0"] and
173+
next(result) = cut
174+
}
175+
176+
/** Gets the cutoff char used for a given `part` of a range when pretty-printing it. */
177+
private string cutoff(OverlyWideRange range, int part) {
178+
exists(int low, int high | isRange(range, low, high) |
179+
result =
180+
rank[part + 1](string cut |
181+
cut = cutoffs() and low < toCodePoint(cut) and toCodePoint(cut) < high
182+
|
183+
cut order by toCodePoint(cut)
184+
)
185+
)
186+
}
187+
188+
/** Gets the number of parts we should print for a given `range`. */
189+
private int parts(OverlyWideRange range) { result = 1 + count(cutoff(range, _)) }
190+
191+
/** Holds if the given part of a range should span from `low` to `high`. */
192+
private predicate part(OverlyWideRange range, int part, string low, string high) {
193+
// first part.
194+
part = 0 and
195+
(
196+
range.isRange(low, high) and
197+
parts(range) = 1
198+
or
199+
parts(range) >= 2 and
200+
range.isRange(low, _) and
201+
high = highCut(cutoff(range, part))
202+
)
203+
or
204+
// middle
205+
part >= 1 and
206+
part < parts(range) - 1 and
207+
low = lowCut(cutoff(range, part - 1)) and
208+
high = highCut(cutoff(range, part))
209+
or
210+
// last.
211+
part = parts(range) - 1 and
212+
low = lowCut(cutoff(range, part - 1)) and
213+
range.isRange(_, high)
214+
}
215+
216+
/** Gets an escaped `char` for use in a character class. */
217+
bindingset[char]
218+
private string escape(string char) {
219+
exists(string reg | reg = "(\\[|\\]|\\\\|-|/)" |
220+
if char.regexpMatch(reg) then result = "\\" + char else result = char
221+
)
222+
}
223+
224+
/** Gets a part of the equivalent range. */
225+
private string printEquivalentCharClass(OverlyWideRange range, int part) {
226+
exists(string low, string high | part(range, part, low, high) |
227+
if
228+
isAlphanumeric(low) and
229+
isAlphanumeric(high)
230+
then result = low + "-" + high
231+
else
232+
result =
233+
strictconcat(string char | char = getInRange(low, high) | escape(char) order by char)
234+
)
235+
}
236+
237+
/** Gets the entire pretty printed equivalent range. */
238+
string printEquivalentCharClass(OverlyWideRange range) {
239+
result =
240+
strictconcat(string r, int part |
241+
r = "[" and part = -1 and exists(range)
242+
or
243+
r = printEquivalentCharClass(range, part)
244+
or
245+
r = "]" and part = parts(range)
246+
|
247+
r order by part
248+
)
249+
}
250+
}
251+
252+
/** Gets a char range that is overly large because of `reason`. */
253+
RegExpCharacterRange getABadRange(string reason, int priority) {
254+
result instanceof OverlyWideRange and
255+
priority = 0 and
256+
exists(string equiv | equiv = result.(OverlyWideRange).printEquivalent() |
257+
if equiv.length() <= 50
258+
then reason = "is equivalent to " + equiv
259+
else reason = "is equivalent to " + equiv.substring(0, 50) + "..."
260+
)
261+
or
262+
priority = 1 and
263+
exists(RegExpCharacterRange other |
264+
reason = "overlaps with " + other + " in the same character class" and
265+
rankRange(result) < rankRange(other) and
266+
overlap(result, other)
267+
)
268+
or
269+
priority = 2 and
270+
exists(RegExpCharacterClassEscape escape |
271+
reason = "overlaps with " + escape + " in the same character class" and
272+
overlapsWithCharEscape(result, escape)
273+
)
274+
or
275+
reason = "is empty" and
276+
priority = 3 and
277+
exists(int low, int high |
278+
isRange(result, low, high) and
279+
low > high
280+
)
281+
}
282+
283+
/** Holds if `range` matches suspiciously many characters. */
284+
predicate problem(RegExpCharacterRange range, string reason) {
285+
reason =
286+
strictconcat(string m, int priority |
287+
range = getABadRange(m, priority)
288+
|
289+
m, ", and " order by priority desc
290+
) and
291+
// specifying a range using an escape is usually OK.
292+
not range.getAChild() instanceof RegExpEscape and
293+
// Unicode escapes in strings are interpreted before it turns into a regexp,
294+
// so e.g. [\u0001-\uFFFF] will just turn up as a range between two constants.
295+
// We therefore exclude these ranges.
296+
range.getRootTerm().getParent() instanceof RegExpLiteral and
297+
// is used as regexp (mostly for JS where regular expressions are parsed eagerly)
298+
range.getRootTerm().isUsedAsRegExp()
299+
}
300+
}

0 commit comments

Comments
 (0)