Skip to content

Commit a343cea

Browse files
committed
add suspicious-regexp-range query
1 parent 0346b6b commit a343cea

File tree

29 files changed

+1736
-0
lines changed

29 files changed

+1736
-0
lines changed

config/identical-files.json

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -506,6 +506,12 @@
506506
"python/ql/lib/semmle/python/security/BadTagFilterQuery.qll",
507507
"ruby/ql/lib/codeql/ruby/security/BadTagFilterQuery.qll"
508508
],
509+
"SuspiciousRegexRange Python/JS/Ruby/Java": [
510+
"javascript/ql/lib/semmle/javascript/security/SuspiciousRegexpRangeQuery.qll",
511+
"python/ql/lib/semmle/python/security/SuspiciousRegexpRangeQuery.qll",
512+
"ruby/ql/lib/codeql/ruby/security/SuspiciousRegexpRangeQuery.qll",
513+
"java/ql/lib/semmle/code/java/security/SuspiciousRegexpRangeQuery.qll"
514+
],
509515
"CFG": [
510516
"csharp/ql/lib/semmle/code/csharp/controlflow/internal/ControlFlowGraphImplShared.qll",
511517
"ruby/ql/lib/codeql/ruby/controlflow/internal/ControlFlowGraphImplShared.qll",
Lines changed: 302 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,302 @@
1+
/**
2+
* Classes and predicates for working with suspicious character ranges.
3+
*/
4+
5+
// We don't need the NFA utils, just the regexp tree.
6+
// but the below is a nice shared library that exposes the API we need.
7+
import performance.ReDoSUtil
8+
9+
/**
10+
* Gets a rank for `range` that is unique for ranges in the same file.
11+
* Prioritizes ranges that match more characters.
12+
*/
13+
int rankRange(RegExpCharacterRange range) {
14+
range =
15+
rank[result](RegExpCharacterRange r, Location l, int low, int high |
16+
r.getLocation() = l and
17+
isRange(r, low, high)
18+
|
19+
r order by (high - low) desc, l.getStartLine(), l.getStartColumn()
20+
)
21+
}
22+
23+
/** Holds if `range` spans from the unicode code points `low` to `high` (both inclusive). */
24+
predicate isRange(RegExpCharacterRange range, int low, int high) {
25+
exists(string lowc, string highc |
26+
range.isRange(lowc, highc) and
27+
low.toUnicode() = lowc and
28+
high.toUnicode() = highc
29+
)
30+
}
31+
32+
/** Holds if `char` is an alpha-numeric character. */
33+
predicate isAlphanumeric(string char) {
34+
// written like this to avoid having a bindingset for the predicate
35+
char = [[48 .. 57], [65 .. 90], [97 .. 122]].toUnicode() // 0-9, A-Z, a-z
36+
}
37+
38+
/**
39+
* Holds if the given ranges are from the same character class
40+
* and there exists at least one character matched by both ranges.
41+
*/
42+
predicate overlap(RegExpCharacterRange a, RegExpCharacterRange b) {
43+
exists(RegExpCharacterClass clz |
44+
a = clz.getAChild() and
45+
b = clz.getAChild()
46+
|
47+
// b contains the lower end of a
48+
exists(int alow, int ahigh, int blow, int bhigh |
49+
isRange(a, alow, ahigh) and
50+
isRange(b, blow, bhigh) and
51+
blow <= alow and
52+
bhigh >= ahigh
53+
)
54+
or
55+
// b contains the upper end of a
56+
exists(int blow, int bhigh, int alow, int ahigh |
57+
isRange(a, alow, ahigh) and
58+
isRange(b, blow, bhigh) and
59+
blow <= ahigh and
60+
bhigh >= ahigh
61+
)
62+
)
63+
or
64+
// symmetric overlap
65+
overlap(b, a)
66+
}
67+
68+
/**
69+
* Holds if `range` overlaps with the char class `escape` from the same character class.
70+
*/
71+
predicate overlapsWithCharEscape(RegExpCharacterRange range, RegExpCharacterClassEscape escape) {
72+
exists(RegExpCharacterClass clz, string low, string high |
73+
range = clz.getAChild() and
74+
escape = clz.getAChild() and
75+
range.isRange(low, high)
76+
|
77+
escape.getValue() = "w" and
78+
inRange(low, high).regexpMatch("\\w")
79+
or
80+
escape.getValue() = "d" and
81+
inRange(low, high).regexpMatch("\\d")
82+
or
83+
escape.getValue() = "s" and
84+
inRange(low, high).regexpMatch("\\s")
85+
)
86+
}
87+
88+
/** Gets the unicode code point for a `char`. */
89+
bindingset[char]
90+
int toCodePoint(string char) { result.toUnicode() = char }
91+
92+
/** A character range that appears to be overly wide. */
93+
class OverlyWideRange extends RegExpCharacterRange {
94+
OverlyWideRange() {
95+
exists(int low, int high, int numChars |
96+
isRange(this, low, high) and
97+
numChars = (1 + high - low) and
98+
this.getRootTerm().isUsedAsRegExp() and
99+
numChars >= 10
100+
|
101+
// across the Z-a range (which includes backticks)
102+
toCodePoint("Z") >= low and
103+
toCodePoint("a") <= high
104+
or
105+
// across the 9-A range (which includes e.g. ; and ?)
106+
toCodePoint("9") >= low and
107+
toCodePoint("A") <= high
108+
or
109+
// any non-alpha numeric as part of the range
110+
not isAlphanumeric([low, high].toUnicode())
111+
) and
112+
// some cases I want to exclude from being flagged
113+
not this = allowedWideRanges()
114+
}
115+
116+
/** Gets a string representation of a character class that matches the same chars as this range. */
117+
string printEquivalent() { result = RangePrinter::printEquivalentCharClass(this) }
118+
}
119+
120+
/** Gets a range that should not be reported as an overly wide range. */
121+
RegExpCharacterRange allowedWideRanges() {
122+
// ~ is the last printable ASCII character, it's used right in various wide ranges.
123+
result.isRange(_, "~")
124+
or
125+
// the same with " " and "!". " " is the first printable character, and "!" is the first non-white-space printable character.
126+
result.isRange([" ", "!"], _)
127+
or
128+
// I've seen this often enough, looks OK.
129+
result.isRange("@", "_")
130+
or
131+
// starting from the zero byte is a good indication that it's purposely matching a large range.
132+
result.isRange(0.toUnicode(), _)
133+
}
134+
135+
/** Gets all chars between (and including) `low` and `high`. */
136+
bindingset[low, high]
137+
private string inRange(string low, string high) {
138+
result = [toCodePoint(low) .. toCodePoint(high)].toUnicode()
139+
}
140+
141+
/** A module computing an equivalent character class for an overly wide range. */
142+
module RangePrinter {
143+
bindingset[char]
144+
private string next(string char) {
145+
exists(int prev, int next |
146+
prev.toUnicode() = char and
147+
next.toUnicode() = result and
148+
next = prev + 1
149+
)
150+
}
151+
152+
bindingset[char]
153+
private string prev(string char) {
154+
exists(int prev, int next |
155+
prev.toUnicode() = char and
156+
next.toUnicode() = result and
157+
next = prev - 1
158+
)
159+
}
160+
161+
/** Gets the points where the parts of the pretty printed range should be cut off. */
162+
private string cutoffs() { result = ["A", "Z", "a", "z", "0", "9"] }
163+
164+
/** Gets the char to use in the low end of a range for a given `cut` */
165+
private string lowCut(string cut) {
166+
cut = ["A", "a", "0"] and
167+
result = cut
168+
or
169+
cut = ["Z", "z", "9"] and
170+
result = next(cut)
171+
}
172+
173+
/** Gets the char to use in the high end of a range for a given `cut` */
174+
private string highCut(string cut) {
175+
cut = ["Z", "z", "9"] and
176+
result = cut
177+
or
178+
cut = ["A", "a", "0"] and
179+
result = prev(cut)
180+
}
181+
182+
/** Gets the cutoff char used for a given `part` of a range when pretty-printing it. */
183+
private string cutoff(OverlyWideRange range, int part) {
184+
exists(int low, int high | isRange(range, low, high) |
185+
result =
186+
rank[part + 1](string cut |
187+
cut = cutoffs() and low < toCodePoint(cut) and toCodePoint(cut) < high
188+
|
189+
cut order by toCodePoint(cut)
190+
)
191+
)
192+
}
193+
194+
/** Gets the number of parts we should print for a given `range`. */
195+
private int parts(OverlyWideRange range) { result = 1 + strictcount(cutoff(range, _)) }
196+
197+
/** Holds if the given part of a range should span from `low` to `high`. */
198+
private predicate part(OverlyWideRange range, int part, string low, string high) {
199+
// first part.
200+
part = 0 and
201+
(
202+
range.isRange(low, high) and
203+
parts(range) = 1
204+
or
205+
parts(range) >= 2 and
206+
range.isRange(low, _) and
207+
high = highCut(cutoff(range, part))
208+
)
209+
or
210+
// middle
211+
part >= 1 and
212+
part < parts(range) and
213+
low = lowCut(cutoff(range, part - 1)) and
214+
high = highCut(cutoff(range, part))
215+
or
216+
// last.
217+
part = parts(range) and
218+
low = lowCut(cutoff(range, part - 2)) and
219+
range.isRange(_, high)
220+
}
221+
222+
/** Gets an escaped `char` for use in a character class. */
223+
bindingset[char]
224+
private string escape(string char) {
225+
exists(string reg | reg = "(\\[|\\]|\\\\|-|/)" |
226+
char.regexpMatch(reg) and
227+
result = "\\" + char
228+
or
229+
not char.regexpMatch(reg) and
230+
result = char
231+
)
232+
}
233+
234+
/** Gets a part of the equivalent range. */
235+
private string printEquivalentCharClass(OverlyWideRange range, int part) {
236+
exists(string low, string high | part(range, part, low, high) |
237+
if
238+
isAlphanumeric(low) and
239+
isAlphanumeric(high)
240+
then result = low + "-" + high
241+
else
242+
result = strictconcat(string char | char = inRange(low, high) | escape(char) order by char)
243+
)
244+
}
245+
246+
/** Gets the entire pretty printed equivalent range. */
247+
string printEquivalentCharClass(OverlyWideRange range) {
248+
result =
249+
"[" +
250+
strictconcat(string r, int part |
251+
r = printEquivalentCharClass(range, part)
252+
|
253+
r order by part
254+
) + "]"
255+
}
256+
}
257+
258+
/** Gets a char range that is suspiciously because of `reason`. */
259+
RegExpCharacterRange getABadRange(string reason, int priority) {
260+
priority = 0 and
261+
reason = "is equivalent to " + result.(OverlyWideRange).printEquivalent()
262+
or
263+
priority = 1 and
264+
exists(RegExpCharacterRange other |
265+
reason = "overlaps with " + other + " in the same character class" and
266+
result != other and
267+
rankRange(result) < rankRange(other) and
268+
overlap(result, other)
269+
)
270+
or
271+
priority = 2 and
272+
exists(RegExpCharacterClassEscape escape |
273+
reason = "overlaps with " + escape + " in the same character class" and
274+
overlapsWithCharEscape(result, escape)
275+
)
276+
or
277+
reason = "is empty" and
278+
priority = 3 and
279+
exists(int low, int high |
280+
isRange(result, low, high) and
281+
low > high
282+
)
283+
}
284+
285+
/** Holds if `range` matches suspiciously many characters. */
286+
predicate problem(RegExpCharacterRange range, string reason) {
287+
range = getABadRange(_, _) and
288+
reason =
289+
strictconcat(string m, int priority |
290+
range = getABadRange(m, priority)
291+
|
292+
m, ", and " order by priority desc
293+
) and
294+
// specifying a range using an escape is usually OK.
295+
not range.getAChild() instanceof RegExpEscape and
296+
// Unicode escapes in strings are interpreted before it turns into a regexp,
297+
// so e.g. [\u0001-\uFFFF] will just turn up as a range between two constants.
298+
// We therefore exclude these ranges.
299+
range.getRootTerm().getParent() instanceof RegExpLiteral and
300+
// is used as regexp (mostly for JS where regular expressions are parsed eagerly)
301+
range.getRootTerm().isUsedAsRegExp()
302+
}
Lines changed: 72 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,72 @@
1+
<!DOCTYPE qhelp PUBLIC
2+
"-//Semmle//qhelp//EN"
3+
"qhelp.dtd">
4+
<qhelp>
5+
6+
<overview>
7+
<p>
8+
A regexp range can by accident match more than was intended.
9+
For example, the regular expression <code>/[a-zA-z]/</code> will
10+
match every lowercase and uppercase letters, but the same regular
11+
expression will also match the chars: <code>[\]^_`</code>.
12+
</p>
13+
<p>
14+
On other occasions it can happen that the dash in a regular
15+
expression is not escaped, which will cause it to be interpreted
16+
as part of a range. For example in the character class <code>[a-zA-Z0-9%=.,-_]</code>
17+
the last character range matches the 55 characters between
18+
<code>,</code> and <code>_</code> (both included), which overlaps with the
19+
range <code>[0-9]</code> and is thus clearly not intended.
20+
</p>
21+
</overview>
22+
23+
<recommendation>
24+
<p>
25+
26+
Don't write character ranges were there might be confusion as to
27+
which characters are included in the range.
28+
29+
</p>
30+
</recommendation>
31+
32+
<example>
33+
34+
<p>
35+
The following example code checks whether a string is a valid 6 digit hex color.
36+
</p>
37+
38+
<sample language="java">
39+
import java.util.regex.Pattern
40+
public class Tester {
41+
public static boolean is_valid_hex_color(String color) {
42+
return Pattern.matches("#[0-9a-fA-f]{6}", color);
43+
}
44+
}
45+
</sample>
46+
47+
<p>
48+
However, the <code>A-f</code> range matches every uppercase character, and
49+
thus a "color" like <code>#XYZ</code> is considered valid.
50+
</p>
51+
52+
<p>
53+
The fix is to use an uppercase <code>A-F</code> range instead.
54+
</p>
55+
56+
<sample language="javascript">
57+
import java.util.regex.Pattern
58+
public class Tester {
59+
public static boolean is_valid_hex_color(String color) {
60+
return Pattern.matches("#[0-9a-fA-F]{6}", color);
61+
}
62+
}
63+
</sample>
64+
65+
</example>
66+
67+
<references>
68+
<li>Mitre.org: <a href="https://cwe.mitre.org/data/definitions/20.html">CWE-020</a></li>
69+
<li>github.com: <a href="https://github.com/advisories/GHSA-g4rg-993r-mgx7">CVE-2021-42740</a></li>
70+
<li>wh0.github.io: <a href="https://wh0.github.io/2021/10/28/shell-quote-rce-exploiting.html">Exploiting CVE-2021-42740</a></li>
71+
</references>
72+
</qhelp>

0 commit comments

Comments
 (0)