|
| 1 | +/** |
| 2 | + * Classes and predicates for working with suspicious character ranges. |
| 3 | + */ |
| 4 | + |
| 5 | +// We don't need the NFA utils, just the regexp tree. |
| 6 | +// but the below is a nice shared library that exposes the API we need. |
| 7 | +import performance.ReDoSUtil |
| 8 | + |
| 9 | +/** |
| 10 | + * Gets a rank for `range` that is unique for ranges in the same file. |
| 11 | + * Prioritizes ranges that match more characters. |
| 12 | + */ |
| 13 | +int rankRange(RegExpCharacterRange range) { |
| 14 | + range = |
| 15 | + rank[result](RegExpCharacterRange r, Location l, int low, int high | |
| 16 | + r.getLocation() = l and |
| 17 | + isRange(r, low, high) |
| 18 | + | |
| 19 | + r order by (high - low) desc, l.getStartLine(), l.getStartColumn() |
| 20 | + ) |
| 21 | +} |
| 22 | + |
| 23 | +/** Holds if `range` spans from the unicode code points `low` to `high` (both inclusive). */ |
| 24 | +predicate isRange(RegExpCharacterRange range, int low, int high) { |
| 25 | + exists(string lowc, string highc | |
| 26 | + range.isRange(lowc, highc) and |
| 27 | + low.toUnicode() = lowc and |
| 28 | + high.toUnicode() = highc |
| 29 | + ) |
| 30 | +} |
| 31 | + |
| 32 | +/** Holds if `char` is an alpha-numeric character. */ |
| 33 | +predicate isAlphanumeric(string char) { |
| 34 | + // written like this to avoid having a bindingset for the predicate |
| 35 | + char = [[48 .. 57], [65 .. 90], [97 .. 122]].toUnicode() // 0-9, A-Z, a-z |
| 36 | +} |
| 37 | + |
| 38 | +/** |
| 39 | + * Holds if the given ranges are from the same character class |
| 40 | + * and there exists at least one character matched by both ranges. |
| 41 | + */ |
| 42 | +predicate overlap(RegExpCharacterRange a, RegExpCharacterRange b) { |
| 43 | + exists(RegExpCharacterClass clz | |
| 44 | + a = clz.getAChild() and |
| 45 | + b = clz.getAChild() |
| 46 | + | |
| 47 | + // b contains the lower end of a |
| 48 | + exists(int alow, int ahigh, int blow, int bhigh | |
| 49 | + isRange(a, alow, ahigh) and |
| 50 | + isRange(b, blow, bhigh) and |
| 51 | + blow <= alow and |
| 52 | + bhigh >= ahigh |
| 53 | + ) |
| 54 | + or |
| 55 | + // b contains the upper end of a |
| 56 | + exists(int blow, int bhigh, int alow, int ahigh | |
| 57 | + isRange(a, alow, ahigh) and |
| 58 | + isRange(b, blow, bhigh) and |
| 59 | + blow <= ahigh and |
| 60 | + bhigh >= ahigh |
| 61 | + ) |
| 62 | + ) |
| 63 | + or |
| 64 | + // symmetric overlap |
| 65 | + overlap(b, a) |
| 66 | +} |
| 67 | + |
| 68 | +/** |
| 69 | + * Holds if `range` overlaps with the char class `escape` from the same character class. |
| 70 | + */ |
| 71 | +predicate overlapsWithCharEscape(RegExpCharacterRange range, RegExpCharacterClassEscape escape) { |
| 72 | + exists(RegExpCharacterClass clz, string low, string high | |
| 73 | + range = clz.getAChild() and |
| 74 | + escape = clz.getAChild() and |
| 75 | + range.isRange(low, high) |
| 76 | + | |
| 77 | + escape.getValue() = "w" and |
| 78 | + inRange(low, high).regexpMatch("\\w") |
| 79 | + or |
| 80 | + escape.getValue() = "d" and |
| 81 | + inRange(low, high).regexpMatch("\\d") |
| 82 | + or |
| 83 | + escape.getValue() = "s" and |
| 84 | + inRange(low, high).regexpMatch("\\s") |
| 85 | + ) |
| 86 | +} |
| 87 | + |
| 88 | +/** Gets the unicode code point for a `char`. */ |
| 89 | +bindingset[char] |
| 90 | +int toCodePoint(string char) { result.toUnicode() = char } |
| 91 | + |
| 92 | +/** A character range that appears to be overly wide. */ |
| 93 | +class OverlyWideRange extends RegExpCharacterRange { |
| 94 | + OverlyWideRange() { |
| 95 | + exists(int low, int high, int numChars | |
| 96 | + isRange(this, low, high) and |
| 97 | + numChars = (1 + high - low) and |
| 98 | + this.getRootTerm().isUsedAsRegExp() and |
| 99 | + numChars >= 10 |
| 100 | + | |
| 101 | + // across the Z-a range (which includes backticks) |
| 102 | + toCodePoint("Z") >= low and |
| 103 | + toCodePoint("a") <= high |
| 104 | + or |
| 105 | + // across the 9-A range (which includes e.g. ; and ?) |
| 106 | + toCodePoint("9") >= low and |
| 107 | + toCodePoint("A") <= high |
| 108 | + or |
| 109 | + // any non-alpha numeric as part of the range |
| 110 | + not isAlphanumeric([low, high].toUnicode()) |
| 111 | + ) and |
| 112 | + // some cases I want to exclude from being flagged |
| 113 | + not this = allowedWideRanges() |
| 114 | + } |
| 115 | + |
| 116 | + /** Gets a string representation of a character class that matches the same chars as this range. */ |
| 117 | + string printEquivalent() { result = RangePrinter::printEquivalentCharClass(this) } |
| 118 | +} |
| 119 | + |
| 120 | +/** Gets a range that should not be reported as an overly wide range. */ |
| 121 | +RegExpCharacterRange allowedWideRanges() { |
| 122 | + // ~ is the last printable ASCII character, it's used right in various wide ranges. |
| 123 | + result.isRange(_, "~") |
| 124 | + or |
| 125 | + // the same with " " and "!". " " is the first printable character, and "!" is the first non-white-space printable character. |
| 126 | + result.isRange([" ", "!"], _) |
| 127 | + or |
| 128 | + // I've seen this often enough, looks OK. |
| 129 | + result.isRange("@", "_") |
| 130 | + or |
| 131 | + // starting from the zero byte is a good indication that it's purposely matching a large range. |
| 132 | + result.isRange(0.toUnicode(), _) |
| 133 | +} |
| 134 | + |
| 135 | +/** Gets all chars between (and including) `low` and `high`. */ |
| 136 | +bindingset[low, high] |
| 137 | +private string inRange(string low, string high) { |
| 138 | + result = [toCodePoint(low) .. toCodePoint(high)].toUnicode() |
| 139 | +} |
| 140 | + |
| 141 | +/** A module computing an equivalent character class for an overly wide range. */ |
| 142 | +module RangePrinter { |
| 143 | + bindingset[char] |
| 144 | + private string next(string char) { |
| 145 | + exists(int prev, int next | |
| 146 | + prev.toUnicode() = char and |
| 147 | + next.toUnicode() = result and |
| 148 | + next = prev + 1 |
| 149 | + ) |
| 150 | + } |
| 151 | + |
| 152 | + bindingset[char] |
| 153 | + private string prev(string char) { |
| 154 | + exists(int prev, int next | |
| 155 | + prev.toUnicode() = char and |
| 156 | + next.toUnicode() = result and |
| 157 | + next = prev - 1 |
| 158 | + ) |
| 159 | + } |
| 160 | + |
| 161 | + /** Gets the points where the parts of the pretty printed range should be cut off. */ |
| 162 | + private string cutoffs() { result = ["A", "Z", "a", "z", "0", "9"] } |
| 163 | + |
| 164 | + /** Gets the char to use in the low end of a range for a given `cut` */ |
| 165 | + private string lowCut(string cut) { |
| 166 | + cut = ["A", "a", "0"] and |
| 167 | + result = cut |
| 168 | + or |
| 169 | + cut = ["Z", "z", "9"] and |
| 170 | + result = next(cut) |
| 171 | + } |
| 172 | + |
| 173 | + /** Gets the char to use in the high end of a range for a given `cut` */ |
| 174 | + private string highCut(string cut) { |
| 175 | + cut = ["Z", "z", "9"] and |
| 176 | + result = cut |
| 177 | + or |
| 178 | + cut = ["A", "a", "0"] and |
| 179 | + result = prev(cut) |
| 180 | + } |
| 181 | + |
| 182 | + /** Gets the cutoff char used for a given `part` of a range when pretty-printing it. */ |
| 183 | + private string cutoff(OverlyWideRange range, int part) { |
| 184 | + exists(int low, int high | isRange(range, low, high) | |
| 185 | + result = |
| 186 | + rank[part + 1](string cut | |
| 187 | + cut = cutoffs() and low < toCodePoint(cut) and toCodePoint(cut) < high |
| 188 | + | |
| 189 | + cut order by toCodePoint(cut) |
| 190 | + ) |
| 191 | + ) |
| 192 | + } |
| 193 | + |
| 194 | + /** Gets the number of parts we should print for a given `range`. */ |
| 195 | + private int parts(OverlyWideRange range) { result = 1 + strictcount(cutoff(range, _)) } |
| 196 | + |
| 197 | + /** Holds if the given part of a range should span from `low` to `high`. */ |
| 198 | + private predicate part(OverlyWideRange range, int part, string low, string high) { |
| 199 | + // first part. |
| 200 | + part = 0 and |
| 201 | + ( |
| 202 | + range.isRange(low, high) and |
| 203 | + parts(range) = 1 |
| 204 | + or |
| 205 | + parts(range) >= 2 and |
| 206 | + range.isRange(low, _) and |
| 207 | + high = highCut(cutoff(range, part)) |
| 208 | + ) |
| 209 | + or |
| 210 | + // middle |
| 211 | + part >= 1 and |
| 212 | + part < parts(range) and |
| 213 | + low = lowCut(cutoff(range, part - 1)) and |
| 214 | + high = highCut(cutoff(range, part)) |
| 215 | + or |
| 216 | + // last. |
| 217 | + part = parts(range) and |
| 218 | + low = lowCut(cutoff(range, part - 2)) and |
| 219 | + range.isRange(_, high) |
| 220 | + } |
| 221 | + |
| 222 | + /** Gets an escaped `char` for use in a character class. */ |
| 223 | + bindingset[char] |
| 224 | + private string escape(string char) { |
| 225 | + exists(string reg | reg = "(\\[|\\]|\\\\|-|/)" | |
| 226 | + char.regexpMatch(reg) and |
| 227 | + result = "\\" + char |
| 228 | + or |
| 229 | + not char.regexpMatch(reg) and |
| 230 | + result = char |
| 231 | + ) |
| 232 | + } |
| 233 | + |
| 234 | + /** Gets a part of the equivalent range. */ |
| 235 | + private string printEquivalentCharClass(OverlyWideRange range, int part) { |
| 236 | + exists(string low, string high | part(range, part, low, high) | |
| 237 | + if |
| 238 | + isAlphanumeric(low) and |
| 239 | + isAlphanumeric(high) |
| 240 | + then result = low + "-" + high |
| 241 | + else |
| 242 | + result = strictconcat(string char | char = inRange(low, high) | escape(char) order by char) |
| 243 | + ) |
| 244 | + } |
| 245 | + |
| 246 | + /** Gets the entire pretty printed equivalent range. */ |
| 247 | + string printEquivalentCharClass(OverlyWideRange range) { |
| 248 | + result = |
| 249 | + "[" + |
| 250 | + strictconcat(string r, int part | |
| 251 | + r = printEquivalentCharClass(range, part) |
| 252 | + | |
| 253 | + r order by part |
| 254 | + ) + "]" |
| 255 | + } |
| 256 | +} |
| 257 | + |
| 258 | +/** Gets a char range that is suspiciously because of `reason`. */ |
| 259 | +RegExpCharacterRange getABadRange(string reason, int priority) { |
| 260 | + priority = 0 and |
| 261 | + reason = "is equivalent to " + result.(OverlyWideRange).printEquivalent() |
| 262 | + or |
| 263 | + priority = 1 and |
| 264 | + exists(RegExpCharacterRange other | |
| 265 | + reason = "overlaps with " + other + " in the same character class" and |
| 266 | + result != other and |
| 267 | + rankRange(result) < rankRange(other) and |
| 268 | + overlap(result, other) |
| 269 | + ) |
| 270 | + or |
| 271 | + priority = 2 and |
| 272 | + exists(RegExpCharacterClassEscape escape | |
| 273 | + reason = "overlaps with " + escape + " in the same character class" and |
| 274 | + overlapsWithCharEscape(result, escape) |
| 275 | + ) |
| 276 | + or |
| 277 | + reason = "is empty" and |
| 278 | + priority = 3 and |
| 279 | + exists(int low, int high | |
| 280 | + isRange(result, low, high) and |
| 281 | + low > high |
| 282 | + ) |
| 283 | +} |
| 284 | + |
| 285 | +/** Holds if `range` matches suspiciously many characters. */ |
| 286 | +predicate problem(RegExpCharacterRange range, string reason) { |
| 287 | + range = getABadRange(_, _) and |
| 288 | + reason = |
| 289 | + strictconcat(string m, int priority | |
| 290 | + range = getABadRange(m, priority) |
| 291 | + | |
| 292 | + m, ", and " order by priority desc |
| 293 | + ) and |
| 294 | + // specifying a range using an escape is usually OK. |
| 295 | + not range.getAChild() instanceof RegExpEscape and |
| 296 | + // Unicode escapes in strings are interpreted before it turns into a regexp, |
| 297 | + // so e.g. [\u0001-\uFFFF] will just turn up as a range between two constants. |
| 298 | + // We therefore exclude these ranges. |
| 299 | + range.getRootTerm().getParent() instanceof RegExpLiteral and |
| 300 | + // is used as regexp (mostly for JS where regular expressions are parsed eagerly) |
| 301 | + range.getRootTerm().isUsedAsRegExp() |
| 302 | +} |
0 commit comments