|
2 | 2 | * Classes and predicates for working with suspicious character ranges.
|
3 | 3 | */
|
4 | 4 |
|
5 |
| -// We don't need the NFA utils, just the regexp tree, |
6 |
| -// but the below is a nice shared library that exposes the API we need. |
7 |
| -import regexp.NfaUtils |
8 |
| - |
/**
 * Gets the rank of `range` among all character ranges.
 *
 * Ranges are ordered by how many code points they span (widest first), and
 * then by the start line and start column of their location, so two ranges
 * in the same file never share a rank.
 */
int rankRange(RegExpCharacterRange range) {
  range =
    rank[result](RegExpCharacterRange candidate, Location loc, int lo, int hi |
      isRange(candidate, lo, hi) and
      loc = candidate.getLocation()
    |
      candidate order by (hi - lo) desc, loc.getStartLine(), loc.getStartColumn()
    )
}
22 |
| - |
/**
 * Holds if `range` spans from the unicode code point `low` up to the unicode
 * code point `high`, with both end points included.
 */
predicate isRange(RegExpCharacterRange range, int low, int high) {
  exists(string lowerChar, string upperChar | range.isRange(lowerChar, upperChar) |
    lowerChar = low.toUnicode() and
    upperChar = high.toUnicode()
  )
}
31 |
| - |
/**
 * Holds if `char` is one of the ASCII alpha-numeric characters
 * (`0`-`9`, `A`-`Z` or `a`-`z`).
 */
predicate isAlphanumeric(string char) {
  // Enumerating the code points (instead of matching on `char`) means the
  // predicate does not need a bindingset.
  exists(int codePoint |
    codePoint = [48 .. 57] // 0-9
    or
    codePoint = [65 .. 90] // A-Z
    or
    codePoint = [97 .. 122] // a-z
  |
    char = codePoint.toUnicode()
  )
}
37 |
| - |
/**
 * Holds if `a` and `b` are two different ranges inside the same character
 * class, and at least one character is matched by both of them.
 */
predicate overlap(RegExpCharacterRange a, RegExpCharacterRange b) {
  a != b and
  // both ranges are children of one and the same character class
  exists(RegExpCharacterClass cls | cls.getAChild() = a and cls.getAChild() = b) and
  // the two closed code point intervals intersect
  exists(int aStart, int aEnd, int bStart, int bEnd |
    isRange(a, aStart, aEnd) and
    isRange(b, bStart, bEnd)
  |
    aStart <= bEnd and
    bStart <= aEnd
  )
}
56 |
| - |
/**
 * Holds if `range` and the character class escape `escape` are children of
 * the same character class, and some character in `range` is also matched by
 * `escape`. Only the escapes `\w`, `\d` and `\s` are considered.
 */
predicate overlapsWithCharEscape(RegExpCharacterRange range, RegExpCharacterClassEscape escape) {
  exists(RegExpCharacterClass cls | cls.getAChild() = range and cls.getAChild() = escape) and
  exists(string first, string last, string member |
    range.isRange(first, last) and
    // `member` is any single character covered by the range
    member = getInRange(first, last)
  |
    escape.getValue() = "w" and
    member.regexpMatch("\\w")
    or
    escape.getValue() = "d" and
    member.regexpMatch("\\d")
    or
    escape.getValue() = "s" and
    member.regexpMatch("\\s")
  )
}
76 |
| - |
/**
 * Gets the unicode code point of `char`.
 *
 * `char` has to be bound by the caller; the result is the integer whose
 * `toUnicode()` rendering equals `char`.
 */
bindingset[char]
int toCodePoint(string char) { char = result.toUnicode() }
80 |
| - |
/**
 * A character range, in a term used as a regular expression, that spans at
 * least 10 code points and looks wider than its author intended.
 */
class OverlyWideRange extends RegExpCharacterRange {
  OverlyWideRange() {
    this.getRootTerm().isUsedAsRegExp() and
    // ranges on the allowlist are never reported
    not allowedWideRanges() = this and
    exists(int first, int last |
      isRange(this, first, last) and
      // the range covers at least 10 code points
      1 + last - first >= 10
    |
      // the range crosses the gap between `Z` and `a` (which includes backticks)
      first <= toCodePoint("Z") and
      last >= toCodePoint("a")
      or
      // the range crosses the gap between `9` and `A` (which includes e.g. `;` and `?`)
      first <= toCodePoint("9") and
      last >= toCodePoint("A")
      or
      // the range is entirely ASCII, yet at least one of its boundaries
      // is not an alpha-numeric character
      first < 128 and
      last < 128 and
      not (isAlphanumeric(first.toUnicode()) and isAlphanumeric(last.toUnicode()))
    )
  }

  /**
   * Gets the text of a character class that matches exactly the characters
   * matched by this range, as computed by `RangePrinter`.
   */
  string printEquivalent() { result = RangePrinter::printEquivalentCharClass(this) }
}
111 |
| - |
/**
 * Gets a range that is assumed to be intentionally wide, and therefore
 * should not be reported as an overly wide range.
 */
RegExpCharacterRange allowedWideRanges() {
  exists(string first, string last | result.isRange(first, last) |
    // `~` is the last printable ASCII character, so it is commonly used as
    // the upper end of deliberately wide ranges.
    last = "~"
    or
    // Likewise for the lower end: " " is the first printable character and
    // "!" is the first printable character that is not white-space.
    first = [" ", "!"]
    or
    // the `[@-_]` range is intentional
    first = "@" and
    last = "_"
    or
    // starting at the zero byte is a good indication that the range is
    // purposely matching a large set of characters.
    first = 0.toUnicode()
  )
}
126 |
| - |
/**
 * Gets a single character whose code point lies between the code points of
 * `low` and `high`, both end points included.
 */
bindingset[low, high]
private string getInRange(string low, string high) {
  exists(int codePoint | codePoint = [toCodePoint(low) .. toCodePoint(high)] |
    result = codePoint.toUnicode()
  )
}
132 |
| - |
/**
 * A module that renders an overly wide range as an equivalent character
 * class, split into parts at the boundaries of the alpha-numeric blocks.
 */
module RangePrinter {
  /**
   * Gets the character whose code point is one higher than that of `char`.
   * Can be evaluated from either side: given `char`, or given the result.
   */
  bindingset[char]
  bindingset[result]
  private string next(string char) {
    exists(int before, int after |
      char = before.toUnicode() and
      result = after.toUnicode() and
      after = before + 1
    )
  }

  /** Gets a character at which the pretty printed range may be cut into parts. */
  private string cutoffs() { result = ["0", "9", "A", "Z", "a", "z"] }

  /** Gets the character that starts the part following the cutoff `cut`. */
  private string lowCut(string cut) {
    // a block start (`A`, `a`, `0`) opens the next part itself
    result = ["A", "a", "0"] and
    cut = result
    or
    // after a block end (`Z`, `z`, `9`) the next part starts one character later
    cut = ["Z", "z", "9"] and
    result = next(cut)
  }

  /** Gets the character that ends the part preceding the cutoff `cut`. */
  private string highCut(string cut) {
    // a block end (`Z`, `z`, `9`) closes the preceding part itself
    result = ["Z", "z", "9"] and
    cut = result
    or
    // before a block start (`A`, `a`, `0`) the preceding part ends one character earlier
    cut = ["A", "a", "0"] and
    next(result) = cut
  }

  /**
   * Gets the cutoff character that terminates part number `part` of `range`.
   * Only cutoffs strictly between the end points of `range` are used, and
   * they are numbered (from 0) in order of increasing code point.
   */
  private string cutoff(OverlyWideRange range, int part) {
    exists(int first, int last | isRange(range, first, last) |
      result =
        rank[part + 1](string c |
          c = cutoffs() and first < toCodePoint(c) and toCodePoint(c) < last
        |
          c order by toCodePoint(c)
        )
    )
  }

  /** Gets the number of parts to print for `range`: one more than its number of cutoffs. */
  private int parts(OverlyWideRange range) { result = 1 + count(cutoff(range, _)) }

  /** Holds if part number `part` of `range` spans from `low` to `high`. */
  private predicate part(OverlyWideRange range, int part, string low, string high) {
    exists(int total | total = parts(range) |
      // the first part starts at the lower end of the range
      part = 0 and
      (
        // no cutoffs at all: the single part is the whole range
        total = 1 and
        range.isRange(low, high)
        or
        total >= 2 and
        range.isRange(low, _) and
        high = highCut(cutoff(range, part))
      )
      or
      // a middle part runs from the previous cutoff to its own cutoff
      part in [1 .. total - 2] and
      low = lowCut(cutoff(range, part - 1)) and
      high = highCut(cutoff(range, part))
      or
      // the last part ends at the upper end of the range
      part = total - 1 and
      low = lowCut(cutoff(range, part - 1)) and
      range.isRange(_, high)
    )
  }

  /**
   * Gets `char`, prefixed with a backslash if it is one of `[`, `]`, `\`, `-`
   * or `/`, so that it can be placed inside a character class.
   */
  bindingset[char]
  private string escape(string char) {
    char.regexpMatch("(\\[|\\]|\\\\|-|/)") and
    result = "\\" + char
    or
    not char.regexpMatch("(\\[|\\]|\\\\|-|/)") and
    result = char
  }

  /** Gets the text for part number `part` of the class equivalent to `range`. */
  private string printEquivalentCharClass(OverlyWideRange range, int part) {
    exists(string low, string high | part(range, part, low, high) |
      // a part delimited by alpha-numeric characters is printed as a range
      isAlphanumeric(low) and
      isAlphanumeric(high) and
      result = low + "-" + high
      or
      // any other part is spelled out character by character
      not (isAlphanumeric(low) and isAlphanumeric(high)) and
      result = strictconcat(string c | c = getInRange(low, high) | escape(c) order by c)
    )
  }

  /** Gets the complete pretty printed character class that is equivalent to `range`. */
  string printEquivalentCharClass(OverlyWideRange range) {
    result =
      strictconcat(string piece, int idx |
        // opening bracket, sorted before every part
        piece = "[" and idx = -1 and exists(range)
        or
        piece = printEquivalentCharClass(range, idx)
        or
        // closing bracket, sorted after every part
        piece = "]" and idx = parts(range)
      |
        piece order by idx
      )
  }
}
241 |
| - |
/**
 * Gets a character range that is suspicious because of `reason`.
 * `priority` identifies the kind of problem: 0 for an overly wide range,
 * 1 for an overlap with another range, 2 for an overlap with a character
 * class escape, and 3 for an empty range.
 */
RegExpCharacterRange getABadRange(string reason, int priority) {
  priority = 0 and
  exists(string equivalent | equivalent = result.(OverlyWideRange).printEquivalent() |
    equivalent.length() <= 50 and
    reason = "is equivalent to " + equivalent
    or
    // long equivalents are cut off after 50 characters
    equivalent.length() > 50 and
    reason = "is equivalent to " + equivalent.substring(0, 50) + "..."
  )
  or
  priority = 1 and
  exists(RegExpCharacterRange other |
    overlap(result, other) and
    // of two overlapping ranges, only the one with the lower rank is reported
    rankRange(result) < rankRange(other)
  |
    reason = "overlaps with " + other + " in the same character class"
  )
  or
  priority = 2 and
  exists(RegExpCharacterClassEscape escape | overlapsWithCharEscape(result, escape) |
    reason = "overlaps with " + escape + " in the same character class"
  )
  or
  priority = 3 and
  reason = "is empty" and
  exists(int low, int high | isRange(result, low, high) | low > high)
}
272 |
| - |
/**
 * Holds if `range` is a suspicious character range, where `reason` lists
 * every problem found for it, joined by ", and " with the highest priority
 * first.
 */
predicate problem(RegExpCharacterRange range, string reason) {
  // is used as regexp (mostly for JS where regular expressions are parsed eagerly)
  range.getRootTerm().isUsedAsRegExp() and
  // Unicode escapes in strings are interpreted before the string turns into a
  // regexp, so e.g. [\u0001-\uFFFF] just shows up as a range between two
  // constants. Only ranges inside regexp literals are therefore considered.
  range.getRootTerm().getParent() instanceof RegExpLiteral and
  // specifying a range using an escape is usually OK.
  not range.getAChild() instanceof RegExpEscape and
  reason =
    strictconcat(string message, int priority |
      range = getABadRange(message, priority)
    |
      message, ", and " order by priority desc
    )
}
| 5 | +private import semmle.code.java.regex.RegexTreeView::RegexTreeView as TreeView |
| 6 | +// OverlyLargeRangeQuery should be used directly from the shared pack, and not from this file. |
| 7 | +deprecated import codeql.regex.OverlyLargeRangeQuery::Make<TreeView> as Dep |
| 8 | +import Dep |
0 commit comments