Skip to content

Commit dbeae9a

Browse files
committed
make a parameterized module out of the RegexpMatching implementation
1 parent 7fb3d81 commit dbeae9a

File tree

7 files changed

+626
-570
lines changed

7 files changed

+626
-570
lines changed

config/identical-files.json

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -501,6 +501,11 @@
501501
"ruby/ql/lib/codeql/ruby/security/performance/SuperlinearBackTracking.qll",
502502
"java/ql/lib/semmle/code/java/security/performance/SuperlinearBackTracking.qll"
503503
],
504+
"RegexpMatching Python/JS/Ruby": [
505+
"javascript/ql/lib/semmle/javascript/security/performance/RegexpMatching.qll",
506+
"python/ql/lib/semmle/python/security/performance/RegexpMatching.qll",
507+
"ruby/ql/lib/codeql/ruby/security/performance/RegexpMatching.qll"
508+
],
504509
"BadTagFilterQuery Python/JS/Ruby": [
505510
"javascript/ql/lib/semmle/javascript/security/BadTagFilterQuery.qll",
506511
"python/ql/lib/semmle/python/security/BadTagFilterQuery.qll",

javascript/ql/lib/semmle/javascript/security/BadTagFilterQuery.qll

Lines changed: 32 additions & 190 deletions
Original file line numberDiff line numberDiff line change
@@ -2,196 +2,25 @@
22
* Provides precicates for reasoning about bad tag filter vulnerabilities.
33
*/
44

5-
import performance.ReDoSUtil
5+
import performance.RegexpMatching
66

77
/**
8-
* A module for determining if a regexp matches a given string,
9-
* and reasoning about which capture groups are filled by a given string.
8+
* Holds if the regexp `root` should be tested against `str`.
9+
* Implements the `isRegexpMatchingCandidateSig` signature from `RegexpMatching`.
1010
*/
11-
private module RegexpMatching {
12-
/**
13-
* A class to test whether a regular expression matches a string.
14-
* Override this class and extend `test`/`testWithGroups` to configure which strings should be tested for acceptance by this regular expression.
15-
* The result can afterwards be read from the `matches` predicate.
16-
*
17-
* Strings in the `testWithGroups` predicate are also tested for which capture groups are filled by the given string.
18-
* The result is available in the `fillCaptureGroup` predicate.
19-
*/
20-
abstract class MatchedRegExp extends RegExpTerm {
21-
MatchedRegExp() { this.isRootTerm() }
22-
23-
/**
24-
* Holds if it should be tested whether this regular expression matches `str`.
25-
*
26-
* If `ignorePrefix` is true, then a regexp without a start anchor will be treated as if it had a start anchor.
27-
* E.g. a regular expression `/foo$/` will match any string that ends with "foo",
28-
* but if `ignorePrefix` is true, it will only match "foo".
29-
*/
30-
predicate test(string str, boolean ignorePrefix) {
31-
none() // maybe overridden in subclasses
32-
}
33-
34-
/**
35-
* Same as `test(..)`, but where the `fillsCaptureGroup` afterwards tells which capture groups were filled by the given string.
36-
*/
37-
predicate testWithGroups(string str, boolean ignorePrefix) {
38-
none() // maybe overridden in subclasses
39-
}
40-
41-
/**
42-
* Holds if this RegExp matches `str`, where `str` is either in the `test` or `testWithGroups` predicate.
43-
*/
44-
final predicate matches(string str) {
45-
exists(State state | state = getAState(this, str.length() - 1, str, _) |
46-
epsilonSucc*(state) = Accept(_)
47-
)
48-
}
49-
50-
/**
51-
* Holds if matching `str` may fill capture group number `g`.
52-
* Only holds if `str` is in the `testWithGroups` predicate.
53-
*/
54-
final predicate fillsCaptureGroup(string str, int g) {
55-
exists(State s |
56-
s = getAStateThatReachesAccept(this, _, str, _) and
57-
g = group(s.getRepr())
58-
)
59-
}
60-
}
61-
62-
/**
63-
* Gets a state the regular expression `reg` can be in after matching the `i`th char in `str`.
64-
* The regular expression is modeled as a non-determistic finite automaton,
65-
* the regular expression can therefore be in multiple states after matching a character.
66-
*
67-
* It's a forward search to all possible states, and there is thus no guarantee that the state is on a path to an accepting state.
68-
*/
69-
private State getAState(MatchedRegExp reg, int i, string str, boolean ignorePrefix) {
70-
// start state, the -1 position before any chars have been matched
71-
i = -1 and
72-
(
73-
reg.test(str, ignorePrefix)
74-
or
75-
reg.testWithGroups(str, ignorePrefix)
76-
) and
77-
result.getRepr().getRootTerm() = reg and
78-
isStartState(result)
79-
or
80-
// recursive case
81-
result = getAStateAfterMatching(reg, _, str, i, _, ignorePrefix)
82-
}
83-
84-
/**
85-
* Gets the next state after the `prev` state from `reg`.
86-
* `prev` is the state after matching `fromIndex` chars in `str`,
87-
* and the result is the state after matching `toIndex` chars in `str`.
88-
*
89-
* This predicate is used as a step relation in the forwards search (`getAState`),
90-
* and also as a step relation in the later backwards search (`getAStateThatReachesAccept`).
91-
*/
92-
private State getAStateAfterMatching(
93-
MatchedRegExp reg, State prev, string str, int toIndex, int fromIndex, boolean ignorePrefix
94-
) {
95-
// the basic recursive case - outlined into a noopt helper to make performance work out.
96-
result = getAStateAfterMatchingAux(reg, prev, str, toIndex, fromIndex, ignorePrefix)
97-
or
98-
// we can skip past word boundaries if the next char is a non-word char.
99-
fromIndex = toIndex and
100-
prev.getRepr() instanceof RegExpWordBoundary and
101-
prev = getAState(reg, toIndex, str, ignorePrefix) and
102-
after(prev.getRepr()) = result and
103-
str.charAt(toIndex + 1).regexpMatch("\\W") // \W matches any non-word char.
104-
}
105-
106-
pragma[noopt]
107-
private State getAStateAfterMatchingAux(
108-
MatchedRegExp reg, State prev, string str, int toIndex, int fromIndex, boolean ignorePrefix
109-
) {
110-
prev = getAState(reg, fromIndex, str, ignorePrefix) and
111-
fromIndex = toIndex - 1 and
112-
exists(string char | char = str.charAt(toIndex) | specializedDeltaClosed(prev, char, result)) and
113-
not discardedPrefixStep(prev, result, ignorePrefix)
114-
}
115-
116-
/** Holds if a step from `prev` to `next` should be discarded when the `ignorePrefix` flag is set. */
117-
private predicate discardedPrefixStep(State prev, State next, boolean ignorePrefix) {
118-
prev = mkMatch(any(RegExpRoot r)) and
119-
ignorePrefix = true and
120-
next = prev
121-
}
122-
123-
// The `deltaClosed` relation specialized to the chars that exists in strings tested by a `MatchedRegExp`.
124-
private predicate specializedDeltaClosed(State prev, string char, State next) {
125-
deltaClosed(prev, specializedGetAnInputSymbolMatching(char), next)
126-
}
127-
128-
// The `getAnInputSymbolMatching` relation specialized to the chars that exists in strings tested by a `MatchedRegExp`.
129-
pragma[noinline]
130-
private InputSymbol specializedGetAnInputSymbolMatching(string char) {
131-
exists(string s, MatchedRegExp r |
132-
r.test(s, _)
133-
or
134-
r.testWithGroups(s, _)
135-
|
136-
char = s.charAt(_)
137-
) and
138-
result = getAnInputSymbolMatching(char)
139-
}
140-
141-
/**
142-
* Gets the `i`th state on a path to the accepting state when `reg` matches `str`.
143-
* Starts with an accepting state as found by `getAState` and searches backwards
144-
* to the start state through the reachable states (as found by `getAState`).
145-
*
146-
* This predicate holds the invariant that the result state can be reached with `i` steps from a start state,
147-
* and an accepting state can be found after (`str.length() - 1 - i`) steps from the result.
148-
* The result state is therefore always on a valid path where `reg` accepts `str`.
149-
*
150-
* This predicate is only used to find which capture groups a regular expression has filled,
151-
* and thus the search is only performed for the strings in the `testWithGroups(..)` predicate.
152-
*/
153-
private State getAStateThatReachesAccept(
154-
MatchedRegExp reg, int i, string str, boolean ignorePrefix
155-
) {
156-
// base case, reaches an accepting state from the last state in `getAState(..)`
157-
reg.testWithGroups(str, ignorePrefix) and
158-
i = str.length() - 1 and
159-
result = getAState(reg, i, str, ignorePrefix) and
160-
epsilonSucc*(result) = Accept(_)
11+
private predicate isBadTagFilterCandidate(
12+
RootTerm root, string str, boolean ignorePrefix, boolean testWithGroups
13+
) {
14+
// the regexp must mention "<" and ">" explicitly.
15+
forall(string angleBracket | angleBracket = ["<", ">"] |
16+
any(RegExpConstant term | term.getValue().matches("%" + angleBracket + "%")).getRootTerm() =
17+
root
18+
) and
19+
ignorePrefix = true and
20+
(
21+
str = ["<!-- foo -->", "<!-- foo --!>", "<!- foo ->", "<foo>", "<script>"] and
22+
testWithGroups = true
16123
or
162-
// recursive case. `next` is the next state to be matched after matching `prev`.
163-
// this predicate is doing a backwards search, so `prev` is the result we are looking for.
164-
exists(State next, State prev, int fromIndex, int toIndex |
165-
next = getAStateThatReachesAccept(reg, toIndex, str, ignorePrefix) and
166-
next = getAStateAfterMatching(reg, prev, str, toIndex, fromIndex, ignorePrefix) and
167-
i = fromIndex and
168-
result = prev
169-
)
170-
}
171-
172-
/** Gets the capture group number that `term` belongs to. */
173-
private int group(RegExpTerm term) {
174-
exists(RegExpGroup grp | grp.getNumber() = result | term.getParent*() = grp)
175-
}
176-
}
177-
178-
/** A class to test whether a regular expression matches certain HTML tags. */
179-
class HtmlMatchingRegExp extends RegexpMatching::MatchedRegExp {
180-
HtmlMatchingRegExp() {
181-
// the regexp must mention "<" and ">" explicitly.
182-
forall(string angleBracket | angleBracket = ["<", ">"] |
183-
any(RegExpConstant term | term.getValue().matches("%" + angleBracket + "%")).getRootTerm() =
184-
this
185-
)
186-
}
187-
188-
override predicate testWithGroups(string str, boolean ignorePrefix) {
189-
ignorePrefix = true and
190-
str = ["<!-- foo -->", "<!-- foo --!>", "<!- foo ->", "<foo>", "<script>"]
191-
}
192-
193-
override predicate test(string str, boolean ignorePrefix) {
194-
ignorePrefix = true and
19524
str =
19625
[
19726
"<!-- foo -->", "<!- foo ->", "<!-- foo --!>", "<!-- foo\n -->", "<script>foo</script>",
@@ -200,12 +29,25 @@ class HtmlMatchingRegExp extends RegexpMatching::MatchedRegExp {
20029
"<script src='foo'></script>", "<SCRIPT>foo</SCRIPT>", "<script\tsrc=\"foo\"/>",
20130
"<script\tsrc='foo'></script>", "<sCrIpT>foo</ScRiPt>", "<script src=\"foo\">foo</script >",
20231
"<script src=\"foo\">foo</script foo=\"bar\">", "<script src=\"foo\">foo</script\t\n bar>"
203-
]
204-
}
32+
] and
33+
testWithGroups = false
34+
)
20535
}
20636

207-
/** DEPRECATED: Alias for HtmlMatchingRegExp */
208-
deprecated class HTMLMatchingRegExp = HtmlMatchingRegExp;
37+
/**
38+
* A regexp that matches some string from the `isBadTagFilterCandidate` predicate.
39+
*/
40+
class HtmlMatchingRegExp extends RootTerm {
41+
HtmlMatchingRegExp() { RegexpMatching<isBadTagFilterCandidate/4>::matches(this, _) }
42+
43+
/** Holds if this regexp matched `str`, where `str` is one of the string from `isBadTagFilterCandidate`. */
44+
predicate matches(string str) { RegexpMatching<isBadTagFilterCandidate/4>::matches(this, str) }
45+
46+
/** Holds if this regexp fills capture group `g' when matching `str', where `str` is one of the string from `isBadTagFilterCandidate`. */
47+
predicate fillsCaptureGroup(string str, int g) {
48+
RegexpMatching<isBadTagFilterCandidate/4>::fillsCaptureGroup(this, str, g)
49+
}
50+
}
20951

21052
/**
21153
* Holds if `regexp` matches some HTML tags, but misses some HTML tags that it should match.

0 commit comments

Comments
 (0)