Skip to content

Commit c029048

Browse files
committed
port the Java regex/redos queries to use the shared pack
1 parent d5b0666 commit c029048

File tree

12 files changed

+49
-2376
lines changed

12 files changed

+49
-2376
lines changed

java/ql/lib/semmle/code/java/security/OverlyLargeRangeQuery.qll

Lines changed: 4 additions & 285 deletions
Original file line numberDiff line numberDiff line change
@@ -2,288 +2,7 @@
22
* Classes and predicates for working with suspicious character ranges.
33
*/
44

5-
// We don't need the NFA utils themselves, only the regexp tree,
6-
// but the library imported below is a convenient shared module that exposes the API we need.
7-
import regexp.NfaUtils
8-
9-
/**
 * Gets the rank of `range` among all character ranges, intended to be unique
 * for ranges in the same file.
 *
 * Ranges spanning more code points are ranked first; ties are broken by start
 * line and then by start column.
 */
int rankRange(RegExpCharacterRange range) {
  range =
    rank[result](RegExpCharacterRange candidate, Location loc, int lo, int hi |
      isRange(candidate, lo, hi) and
      loc = candidate.getLocation()
    |
      candidate order by (hi - lo) desc, loc.getStartLine(), loc.getStartColumn()
    )
}
22-
23-
/**
 * Holds if `range` starts at the unicode code point `low` and ends at the
 * unicode code point `high`, with both ends included.
 */
predicate isRange(RegExpCharacterRange range, int low, int high) {
  exists(string first, string last | range.isRange(first, last) |
    first = low.toUnicode() and
    last = high.toUnicode()
  )
}
31-
32-
/** Holds if `char` is one of the characters `0-9`, `A-Z` or `a-z`. */
predicate isAlphanumeric(string char) {
  // The code points are enumerated explicitly so that the predicate needs no bindingset.
  exists(int codePoint |
    codePoint = [48 .. 57] // 0-9
    or
    codePoint = [65 .. 90] // A-Z
    or
    codePoint = [97 .. 122] // a-z
  |
    char = codePoint.toUnicode()
  )
}
37-
38-
/**
 * Holds if `a` and `b` are two different ranges that are children of the same
 * character class, and at least one code point lies within both of them.
 */
predicate overlap(RegExpCharacterRange a, RegExpCharacterRange b) {
  a != b and
  exists(RegExpCharacterClass cls |
    a = cls.getAChild() and
    b = cls.getAChild()
  ) and
  exists(int aLow, int aHigh, int bLow, int bHigh |
    isRange(a, aLow, aHigh) and
    isRange(b, bLow, bHigh)
  |
    // two intervals intersect when each one starts no later than the other one ends
    aLow <= bHigh and
    bLow <= aHigh
  )
}
56-
57-
/**
 * Holds if `range` and the character class escape `escape` are children of the
 * same character class, and some character in `range` is also matched by `escape`.
 *
 * Only the escapes `\w`, `\d` and `\s` are considered.
 */
predicate overlapsWithCharEscape(RegExpCharacterRange range, RegExpCharacterClassEscape escape) {
  exists(RegExpCharacterClass cls |
    range = cls.getAChild() and
    escape = cls.getAChild()
  ) and
  exists(string low, string high, string member |
    range.isRange(low, high) and
    member = getInRange(low, high)
  |
    escape.getValue() = "w" and
    member.regexpMatch("\\w")
    or
    escape.getValue() = "d" and
    member.regexpMatch("\\d")
    or
    escape.getValue() = "s" and
    member.regexpMatch("\\s")
  )
}
76-
77-
/** Gets the unicode code point whose character is `char`. */
bindingset[char]
int toCodePoint(string char) {
  char = result.toUnicode()
}
80-
81-
/**
 * A character range that looks wider than its author probably intended.
 *
 * The range must span at least 10 code points, belong to a term that is used as
 * a regular expression, and not be one of the `allowedWideRanges()`.
 */
class OverlyWideRange extends RegExpCharacterRange {
  OverlyWideRange() {
    this.getRootTerm().isUsedAsRegExp() and
    exists(int lo, int hi |
      isRange(this, lo, hi) and
      // the range contains at least 10 characters
      1 + hi - lo >= 10
    |
      // the range covers the gap between `Z` and `a` (which includes backticks)
      lo <= toCodePoint("Z") and
      toCodePoint("a") <= hi
      or
      // the range covers the gap between `9` and `A` (which includes e.g. ; and ?)
      lo <= toCodePoint("9") and
      toCodePoint("A") <= hi
      or
      // an all-ascii range ...
      lo < 128 and
      hi < 128 and
      // ... where at least one of the two boundaries is not alpha-numeric
      (
        not isAlphanumeric(lo.toUnicode())
        or
        not isAlphanumeric(hi.toUnicode())
      )
    ) and
    // ranges on the allowlist are never considered overly wide
    not this = allowedWideRanges()
  }

  /** Gets the text of a character class that matches exactly the characters of this range. */
  string printEquivalent() { RangePrinter::printEquivalentCharClass(this) = result }
}
111-
112-
/** Gets a range that is on the allowlist, and therefore should not be reported as overly wide. */
RegExpCharacterRange allowedWideRanges() {
  exists(string low, string high | result.isRange(low, high) |
    // `~` is the last printable ASCII character, and is used as the upper bound of various wide ranges.
    high = "~"
    or
    // likewise for the lower bound: " " is the first printable character,
    // and "!" is the first printable character that is not white-space.
    low = [" ", "!"]
    or
    // the `[@-_]` range is intentional
    low = "@" and
    high = "_"
    or
    // a range starting at the zero byte is a good indication that a large range is matched on purpose.
    low = 0.toUnicode()
  )
}
126-
127-
/** Gets a character whose code point lies between those of `low` and `high`, both included. */
bindingset[low, high]
private string getInRange(string low, string high) {
  exists(int codePoint | codePoint = [toCodePoint(low) .. toCodePoint(high)] |
    result = codePoint.toUnicode()
  )
}
132-
133-
/**
 * A module that pretty-prints an overly wide range as an equivalent character class.
 *
 * The range is split into parts at the boundaries of `0-9`, `A-Z` and `a-z`.
 * A part with alpha-numeric ends is printed as `low-high`; any other part is
 * printed as the list of its (escaped) characters.
 */
module RangePrinter {
  /** Gets the character whose code point is one higher than the code point of `char`. */
  bindingset[char]
  bindingset[result]
  private string next(string char) {
    exists(int before, int after | after = before + 1 |
      before.toUnicode() = char and
      after.toUnicode() = result
    )
  }

  /** Gets a character at which the pretty printed range may be split into parts. */
  private string cutoffs() { result = ["0", "9", "A", "Z", "a", "z"] }

  /** Gets the character that starts the part following the cutoff `cut`. */
  private string lowCut(string cut) {
    // the end of an alpha-numeric block: the next part starts right after it
    cut = ["9", "Z", "z"] and
    result = next(cut)
    or
    // the start of an alpha-numeric block: the next part starts on it
    cut = ["0", "A", "a"] and
    result = cut
  }

  /** Gets the character that ends the part preceding the cutoff `cut`. */
  private string highCut(string cut) {
    // the start of an alpha-numeric block: the previous part ends right before it
    cut = ["0", "A", "a"] and
    next(result) = cut
    or
    // the end of an alpha-numeric block: the previous part ends on it
    cut = ["9", "Z", "z"] and
    result = cut
  }

  /**
   * Gets the cutoff character that separates part number `part` of `range` from the following part.
   * Only cutoffs strictly between the two ends of `range` are used, in code point order.
   */
  private string cutoff(OverlyWideRange range, int part) {
    exists(int low, int high | isRange(range, low, high) |
      result =
        rank[part + 1](string cut, int codePoint |
          cut = cutoffs() and
          codePoint = toCodePoint(cut) and
          low < codePoint and
          codePoint < high
        |
          cut order by codePoint
        )
    )
  }

  /** Gets the number of parts that `range` is printed as: one more than its number of cutoffs. */
  private int parts(OverlyWideRange range) {
    result = count(string cut | cut = cutoff(range, _)) + 1
  }

  /** Holds if part number `part` of `range` spans from `low` to `high`. */
  private predicate part(OverlyWideRange range, int part, string low, string high) {
    exists(int total | total = parts(range) |
      // the only part: the entire range
      part = 0 and
      total = 1 and
      range.isRange(low, high)
      or
      // the first of several parts: from the start of the range up to the first cutoff
      part = 0 and
      total >= 2 and
      range.isRange(low, _) and
      high = highCut(cutoff(range, part))
      or
      // a middle part: between two consecutive cutoffs
      1 <= part and
      part < total - 1 and
      low = lowCut(cutoff(range, part - 1)) and
      high = highCut(cutoff(range, part))
      or
      // the last part: from the last cutoff up to the end of the range
      part = total - 1 and
      low = lowCut(cutoff(range, part - 1)) and
      range.isRange(_, high)
    )
  }

  /** Gets `char`, prefixed by a backslash if it is one of `[`, `]`, `\`, `-` or `/`. */
  bindingset[char]
  private string escape(string char) {
    exists(string special | special = "(\\[|\\]|\\\\|-|/)" |
      char.regexpMatch(special) and
      result = "\\" + char
      or
      not char.regexpMatch(special) and
      result = char
    )
  }

  /** Gets the text for part number `part` of the character class equivalent to `range`. */
  private string printEquivalentCharClass(OverlyWideRange range, int part) {
    exists(string low, string high | part(range, part, low, high) |
      // both ends are alpha-numeric: print the part as a range
      isAlphanumeric(low) and
      isAlphanumeric(high) and
      result = low + "-" + high
      or
      // otherwise: list every character of the part
      not (isAlphanumeric(low) and isAlphanumeric(high)) and
      result =
        strictconcat(string member | member = getInRange(low, high) | escape(member) order by member)
    )
  }

  /** Gets the complete character class, including the square brackets, that is equivalent to `range`. */
  string printEquivalentCharClass(OverlyWideRange range) {
    result =
      strictconcat(string piece, int index |
        // the opening bracket is ordered before the first part
        piece = "[" and index = -1 and exists(range)
        or
        // the closing bracket is ordered after the last part
        piece = "]" and index = parts(range)
        or
        piece = printEquivalentCharClass(range, index)
      |
        piece order by index
      )
  }
}
241-
242-
/**
 * Gets a character range that is suspicious for the reason described by `reason`.
 *
 * `priority` identifies the kind of reason: 0 for an overly wide range, 1 for an
 * overlap with another range, 2 for an overlap with a character class escape,
 * and 3 for an empty range.
 */
RegExpCharacterRange getABadRange(string reason, int priority) {
  // an overly wide range; the printed equivalent is truncated after 50 characters
  priority = 0 and
  exists(OverlyWideRange wide, string equiv |
    wide = result and
    equiv = wide.printEquivalent()
  |
    if equiv.length() > 50
    then reason = "is equivalent to " + equiv.substring(0, 50) + "..."
    else reason = "is equivalent to " + equiv
  )
  or
  // an overlap with another range; only the lower-ranked of the two ranges is returned
  priority = 1 and
  exists(RegExpCharacterRange other |
    overlap(result, other) and
    rankRange(result) < rankRange(other) and
    reason = "overlaps with " + other + " in the same character class"
  )
  or
  // an overlap with a character class escape
  priority = 2 and
  exists(RegExpCharacterClassEscape escape |
    overlapsWithCharEscape(result, escape) and
    reason = "overlaps with " + escape + " in the same character class"
  )
  or
  // a range whose lower end is above its upper end
  priority = 3 and
  reason = "is empty" and
  exists(int low, int high | isRange(result, low, high) | low > high)
}
272-
273-
/**
 * Holds if `range` matches suspiciously many characters, where `reason` joins
 * every reason found by `getABadRange` with ", and ", highest priority number first.
 */
predicate problem(RegExpCharacterRange range, string reason) {
  // is used as regexp (mostly for JS where regular expressions are parsed eagerly)
  range.getRootTerm().isUsedAsRegExp() and
  // Unicode escapes in strings are interpreted before the string turns into a regexp,
  // so e.g. [\u0001-\uFFFF] will just turn up as a range between two constants.
  // Such ranges are excluded by requiring the root term to belong to a regexp literal.
  range.getRootTerm().getParent() instanceof RegExpLiteral and
  // specifying a range using an escape is usually OK.
  not exists(RegExpEscape esc | esc = range.getAChild()) and
  reason =
    strictconcat(string message, int priority |
      range = getABadRange(message, priority)
    |
      message, ", and " order by priority desc
    )
}
5+
private import semmle.code.java.regex.RegexTreeView::RegexTreeView as TreeView
6+
// OverlyLargeRangeQuery should be used directly from the shared pack, and not from this file.
7+
deprecated import codeql.regex.OverlyLargeRangeQuery::Make<TreeView> as Dep
8+
import Dep

0 commit comments

Comments
 (0)