|
16 | 16 |
|
17 | 17 | import javascript
|
18 | 18 | import semmle.javascript.security.performance.ReDoSUtil
|
19 |
| - |
20 |
| -/* |
21 |
| - * This query implements the analysis described in the following two papers: |
22 |
| - * |
23 |
| - * James Kirrage, Asiri Rathnayake, Hayo Thielecke: Static Analysis for |
24 |
| - * Regular Expression Denial-of-Service Attacks. NSS 2013. |
25 |
| - * (http://www.cs.bham.ac.uk/~hxt/research/reg-exp-sec.pdf) |
26 |
| - * Asiri Rathnayake, Hayo Thielecke: Static Analysis for Regular Expression |
27 |
| - * Exponential Runtime via Substructural Logics. 2014. |
28 |
| - * (https://www.cs.bham.ac.uk/~hxt/research/redos_full.pdf) |
29 |
| - * |
30 |
| - * The basic idea is to search for overlapping cycles in the NFA, that is, |
31 |
| - * states `q` such that there are two distinct paths from `q` to itself |
32 |
| - * that consume the same word `w`. |
33 |
| - * |
34 |
| - * For any such state `q`, an attack string can be constructed as follows: |
35 |
| - * concatenate a prefix `v` that takes the NFA to `q` with `n` copies of |
36 |
| - * the word `w` that leads back to `q` along two different paths, followed |
37 |
| - * by a suffix `x` that is _not_ accepted in state `q`. A backtracking |
38 |
| - * implementation will need to explore at least 2^n different ways of going |
39 |
| - * from `q` back to itself while trying to match the `n` copies of `w` |
40 |
| - * before finally giving up. |
41 |
| - * |
42 |
| - * Now in order to identify overlapping cycles, all we have to do is find |
43 |
| - * pumpable forks, that is, states `q` that can transition to two different |
44 |
| - * states `r1` and `r2` on the same input symbol `c`, such that there are |
45 |
| - * paths from both `r1` and `r2` to `q` that consume the same word. The latter |
46 |
| - * condition is equivalent to saying that `(q, q)` is reachable from `(r1, r2)` |
47 |
| - * in the product NFA. |
48 |
| - * |
49 |
| - * This is what the query does. It makes a simple attempt to construct a |
50 |
| - * prefix `v` leading into `q`, but only to improve the alert message. |
51 |
| - * And the query tries to prove the existence of a suffix that ensures |
52 |
| - * rejection. This check might fail, which can cause false positives. |
53 |
| - * |
54 |
| - * Finally, sometimes it depends on the translation whether the NFA generated |
55 |
| - * for a regular expression has a pumpable fork or not. We implement one |
56 |
| - * particular translation, which may result in false positives or negatives |
57 |
| - * relative to some particular JavaScript engine. |
58 |
| - * |
59 |
| - * More precisely, the query constructs an NFA from a regular expression `r` |
60 |
| - * as follows: |
61 |
| - * |
62 |
| - * * Every sub-term `t` gives rise to an NFA state `Match(t,i)`, representing |
63 |
| - * the state of the automaton before attempting to match the `i`th character in `t`. |
64 |
| - * * There is one accepting state `Accept(r)`. |
65 |
| - * * There is a special `AcceptAnySuffix(r)` state, which accepts any suffix string |
66 |
| - * by using an epsilon transition to `Accept(r)` and an any transition to itself. |
67 |
| - * * Transitions between states may be labelled with epsilon, or an abstract |
68 |
| - * input symbol. |
69 |
| - * * Each abstract input symbol represents a set of concrete input characters: |
70 |
| - * either a single character, a set of characters represented by a |
71 |
| - * character class, or the set of all characters. |
72 |
| - * * The product automaton is constructed lazily, starting with pair states |
73 |
| - * `(q, q)` where `q` is a fork, and proceding along an over-approximate |
74 |
| - * step relation. |
75 |
| - * * The over-approximate step relation allows transitions along pairs of |
76 |
| - * abstract input symbols where the symbols have overlap in the characters they accept. |
77 |
| - * * Once a trace of pairs of abstract input symbols that leads from a fork |
78 |
| - * back to itself has been identified, we attempt to construct a concrete |
79 |
| - * string corresponding to it, which may fail. |
80 |
| - * * Lastly we ensure that any state reached by repeating `n` copies of `w` has |
81 |
| - * a suffix `x` (possible empty) that is most likely __not__ accepted. |
82 |
| - */ |
83 |
| - |
84 |
| -/** |
85 |
| - * Holds if state `s` might be inside a backtracking repetition. |
86 |
| - */ |
87 |
| -pragma[noinline] |
88 |
| -predicate stateInsideBacktracking(State s) { |
89 |
| - s.getRepr().getParent*() instanceof MaybeBacktrackingRepetition |
90 |
| -} |
91 |
| - |
92 |
| -/** |
93 |
| - * A infinitely repeating quantifier that might backtrack. |
94 |
| - */ |
95 |
| -class MaybeBacktrackingRepetition extends InfiniteRepetitionQuantifier { |
96 |
| - MaybeBacktrackingRepetition() { |
97 |
| - exists(RegExpTerm child | |
98 |
| - child instanceof RegExpAlt or |
99 |
| - child instanceof RegExpQuantifier |
100 |
| - | |
101 |
| - child.getParent+() = this |
102 |
| - ) |
103 |
| - } |
104 |
| -} |
105 |
| - |
106 |
| -/** |
107 |
| - * A state in the product automaton. |
108 |
| - * |
109 |
| - * We lazily only construct those states that we are actually |
110 |
| - * going to need: `(q, q)` for every fork state `q`, and any |
111 |
| - * pair of states that can be reached from a pair that we have |
112 |
| - * already constructed. To cut down on the number of states, |
113 |
| - * we only represent states `(q1, q2)` where `q1` is lexicographically |
114 |
| - * no bigger than `q2`. |
115 |
| - * |
116 |
| - * States are only constructed if both states in the pair are |
117 |
| - * inside a repetition that might backtrack. |
118 |
| - */ |
119 |
| -newtype TStatePair = |
120 |
| - MkStatePair(State q1, State q2) { |
121 |
| - isFork(q1, _, _, _, _) and q2 = q1 |
122 |
| - or |
123 |
| - (step(_, _, _, q1, q2) or step(_, _, _, q2, q1)) and |
124 |
| - rankState(q1) <= rankState(q2) |
125 |
| - } |
126 |
| - |
127 |
| -/** |
128 |
| - * Gets a unique number for a `state`. |
129 |
| - * Is used to create an ordering of states, where states with the same `toString()` will be ordered differently. |
130 |
| - */ |
131 |
| -int rankState(State state) { |
132 |
| - state = |
133 |
| - rank[result](State s, Location l | |
134 |
| - l = s.getRepr().getLocation() |
135 |
| - | |
136 |
| - s order by l.getStartLine(), l.getStartColumn(), s.toString() |
137 |
| - ) |
138 |
| -} |
139 |
| - |
140 |
| -class StatePair extends TStatePair { |
141 |
| - State q1; |
142 |
| - State q2; |
143 |
| - |
144 |
| - StatePair() { this = MkStatePair(q1, q2) } |
145 |
| - |
146 |
| - string toString() { result = "(" + q1 + ", " + q2 + ")" } |
147 |
| - |
148 |
| - State getLeft() { result = q1 } |
149 |
| - |
150 |
| - State getRight() { result = q2 } |
151 |
| -} |
152 |
| - |
153 |
| -predicate isStatePair(StatePair p) { any() } |
154 |
| - |
155 |
| -predicate delta2(StatePair q, StatePair r) { step(q, _, _, r) } |
156 |
| - |
157 |
| -/** |
158 |
| - * Gets the minimum length of a path from `q` to `r` in the |
159 |
| - * product automaton. |
160 |
| - */ |
161 |
| -int statePairDist(StatePair q, StatePair r) = |
162 |
| - shortestDistances(isStatePair/1, delta2/2)(q, r, result) |
163 |
| - |
164 |
| -/** |
165 |
| - * Holds if there are transitions from `q` to `r1` and from `q` to `r2` |
166 |
| - * labelled with `s1` and `s2`, respectively, where `s1` and `s2` do not |
167 |
| - * trivially have an empty intersection. |
168 |
| - * |
169 |
| - * This predicate only holds for states associated with regular expressions |
170 |
| - * that have at least one repetition quantifier in them (otherwise the |
171 |
| - * expression cannot be vulnerable to ReDoS attacks anyway). |
172 |
| - */ |
173 |
| -pragma[noopt] |
174 |
| -predicate isFork(State q, InputSymbol s1, InputSymbol s2, State r1, State r2) { |
175 |
| - stateInsideBacktracking(q) and |
176 |
| - exists(State q1, State q2 | |
177 |
| - q1 = epsilonSucc*(q) and |
178 |
| - delta(q1, s1, r1) and |
179 |
| - q2 = epsilonSucc*(q) and |
180 |
| - delta(q2, s2, r2) and |
181 |
| - // Use pragma[noopt] to prevent intersect(s1,s2) from being the starting point of the join. |
182 |
| - // From (s1,s2) it would find a huge number of intermediate state pairs (q1,q2) originating from different literals, |
183 |
| - // and discover at the end that no `q` can reach both `q1` and `q2` by epsilon transitions. |
184 |
| - exists(intersect(s1, s2)) |
185 |
| - | |
186 |
| - s1 != s2 |
187 |
| - or |
188 |
| - r1 != r2 |
189 |
| - or |
190 |
| - r1 = r2 and q1 != q2 |
191 |
| - or |
192 |
| - // If q can reach itself by epsilon transitions, then there are two distinct paths to the q1/q2 state: |
193 |
| - // one that uses the loop and one that doesn't. The engine will separately attempt to match with each path, |
194 |
| - // despite ending in the same state. The "fork" thus arises from the choice of whether to use the loop or not. |
195 |
| - // To avoid every state in the loop becoming a fork state, |
196 |
| - // we arbitrarily pick the InfiniteRepetitionQuantifier state as the canonical fork state for the loop |
197 |
| - // (every epsilon-loop must contain such a state). |
198 |
| - // |
199 |
| - // We additionally require that the there exists another InfiniteRepetitionQuantifier `mid` on the path from `q` to itself. |
200 |
| - // This is done to avoid flagging regular expressions such as `/(a?)*b/` - that only has polynomial runtime, and is detected by `js/polynomial-redos`. |
201 |
| - // The below code is therefore a heuritic, that only flags regular expressions such as `/(a*)*b/`, |
202 |
| - // and does not flag regular expressions such as `/(a?b?)c/`, but the latter pattern is not used frequently. |
203 |
| - r1 = r2 and |
204 |
| - q1 = q2 and |
205 |
| - epsilonSucc+(q) = q and |
206 |
| - exists(RegExpTerm term | term = q.getRepr() | term instanceof InfiniteRepetitionQuantifier) and |
207 |
| - // One of the mid states is an infinite quantifier itself |
208 |
| - exists(State mid, RegExpTerm term | |
209 |
| - mid = epsilonSucc+(q) and |
210 |
| - term = mid.getRepr() and |
211 |
| - term instanceof InfiniteRepetitionQuantifier and |
212 |
| - q = epsilonSucc+(mid) and |
213 |
| - not mid = q |
214 |
| - ) |
215 |
| - ) and |
216 |
| - stateInsideBacktracking(r1) and |
217 |
| - stateInsideBacktracking(r2) |
218 |
| -} |
219 |
| - |
220 |
| -/** |
221 |
| - * Gets the state pair `(q1, q2)` or `(q2, q1)`; note that only |
222 |
| - * one or the other is defined. |
223 |
| - */ |
224 |
| -StatePair mkStatePair(State q1, State q2) { |
225 |
| - result = MkStatePair(q1, q2) or result = MkStatePair(q2, q1) |
226 |
| -} |
227 |
| - |
228 |
| -/** |
229 |
| - * Holds if there are transitions from the components of `q` to the corresponding |
230 |
| - * components of `r` labelled with `s1` and `s2`, respectively. |
231 |
| - */ |
232 |
| -predicate step(StatePair q, InputSymbol s1, InputSymbol s2, StatePair r) { |
233 |
| - exists(State r1, State r2 | step(q, s1, s2, r1, r2) and r = mkStatePair(r1, r2)) |
234 |
| -} |
235 |
| - |
236 |
| -/** |
237 |
| - * Holds if there are transitions from the components of `q` to `r1` and `r2` |
238 |
| - * labelled with `s1` and `s2`, respectively. |
239 |
| - * |
240 |
| - * We only consider transitions where the resulting states `(r1, r2)` are both |
241 |
| - * inside a repetition that might backtrack. |
242 |
| - */ |
243 |
| -pragma[noopt] |
244 |
| -predicate step(StatePair q, InputSymbol s1, InputSymbol s2, State r1, State r2) { |
245 |
| - exists(State q1, State q2 | q.getLeft() = q1 and q.getRight() = q2 | |
246 |
| - deltaClosed(q1, s1, r1) and |
247 |
| - deltaClosed(q2, s2, r2) and |
248 |
| - // use noopt to force the join on `intersect` to happen last. |
249 |
| - exists(intersect(s1, s2)) |
250 |
| - ) and |
251 |
| - stateInsideBacktracking(r1) and |
252 |
| - stateInsideBacktracking(r2) |
253 |
| -} |
254 |
| - |
255 |
| -private newtype TTrace = |
256 |
| - Nil() or |
257 |
| - Step(InputSymbol s1, InputSymbol s2, TTrace t) { |
258 |
| - exists(StatePair p | |
259 |
| - isReachableFromFork(_, p, t, _) and |
260 |
| - step(p, s1, s2, _) |
261 |
| - ) |
262 |
| - or |
263 |
| - t = Nil() and isFork(_, s1, s2, _, _) |
264 |
| - } |
265 |
| - |
266 |
| -/** |
267 |
| - * A list of pairs of input symbols that describe a path in the product automaton |
268 |
| - * starting from some fork state. |
269 |
| - */ |
270 |
| -class Trace extends TTrace { |
271 |
| - string toString() { |
272 |
| - this = Nil() and result = "Nil()" |
273 |
| - or |
274 |
| - exists(InputSymbol s1, InputSymbol s2, Trace t | this = Step(s1, s2, t) | |
275 |
| - result = "Step(" + s1 + ", " + s2 + ", " + t + ")" |
276 |
| - ) |
277 |
| - } |
278 |
| -} |
279 |
| - |
280 |
| -/** |
281 |
| - * Gets a string corresponding to the trace `t`. |
282 |
| - */ |
283 |
| -string concretise(Trace t) { |
284 |
| - t = Nil() and result = "" |
285 |
| - or |
286 |
| - exists(InputSymbol s1, InputSymbol s2, Trace rest | t = Step(s1, s2, rest) | |
287 |
| - result = concretise(rest) + intersect(s1, s2) |
288 |
| - ) |
289 |
| -} |
290 |
| - |
291 |
| -/** |
292 |
| - * Holds if `r` is reachable from `(fork, fork)` under input `w`, and there is |
293 |
| - * a path from `r` back to `(fork, fork)` with `rem` steps. |
294 |
| - */ |
295 |
| -predicate isReachableFromFork(State fork, StatePair r, Trace w, int rem) { |
296 |
| - // base case |
297 |
| - exists(InputSymbol s1, InputSymbol s2, State q1, State q2 | |
298 |
| - isFork(fork, s1, s2, q1, q2) and |
299 |
| - r = MkStatePair(q1, q2) and |
300 |
| - w = Step(s1, s2, Nil()) and |
301 |
| - rem = statePairDist(r, MkStatePair(fork, fork)) |
302 |
| - ) |
303 |
| - or |
304 |
| - // recursive case |
305 |
| - exists(StatePair p, Trace v, InputSymbol s1, InputSymbol s2 | |
306 |
| - isReachableFromFork(fork, p, v, rem + 1) and |
307 |
| - step(p, s1, s2, r) and |
308 |
| - w = Step(s1, s2, v) and |
309 |
| - rem >= statePairDist(r, MkStatePair(fork, fork)) |
310 |
| - ) |
311 |
| -} |
312 |
| - |
313 |
| -/** |
314 |
| - * Gets a state in the product automaton from which `(fork, fork)` is |
315 |
| - * reachable in zero or more epsilon transitions. |
316 |
| - */ |
317 |
| -StatePair getAForkPair(State fork) { |
318 |
| - isFork(fork, _, _, _, _) and |
319 |
| - result = MkStatePair(epsilonPred*(fork), epsilonPred*(fork)) |
320 |
| -} |
321 |
| - |
322 |
| -/** |
323 |
| - * Holds if `fork` is a pumpable fork with word `w`. |
324 |
| - */ |
325 |
| -predicate isPumpable(State fork, string w) { |
326 |
| - exists(StatePair q, Trace t | |
327 |
| - isReachableFromFork(fork, q, t, _) and |
328 |
| - q = getAForkPair(fork) and |
329 |
| - w = concretise(t) |
330 |
| - ) |
331 |
| -} |
332 |
| - |
333 |
| -/** |
334 |
| - * An instantiation of `ReDoSConfiguration` for exponential backtracking. |
335 |
| - */ |
336 |
| -class ExponentialReDoSConfiguration extends ReDoSConfiguration { |
337 |
| - ExponentialReDoSConfiguration() { this = "ExponentialReDoSConfiguration" } |
338 |
| - |
339 |
| - override predicate isReDoSCandidate(State state, string pump) { isPumpable(state, pump) } |
340 |
| -} |
| 19 | +import semmle.javascript.security.performance.ExponentialBackTracking |
341 | 20 |
|
342 | 21 | from RegExpTerm t, string pump, State s, string prefixMsg
|
343 | 22 | where hasReDoSResult(t, pump, s, prefixMsg)
|
|
0 commit comments