Skip to content

Commit 97264b5

Browse files
committed
add the bad tag filter query to ruby
1 parent c15ddf6 commit 97264b5

File tree

9 files changed

+435
-3
lines changed

9 files changed

+435
-3
lines changed

config/identical-files.json

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -470,11 +470,12 @@
470470
"ReDoS Polynomial Python/JS": [
471471
"javascript/ql/lib/semmle/javascript/security/performance/SuperlinearBackTracking.qll",
472472
"python/ql/lib/semmle/python/security/performance/SuperlinearBackTracking.qll",
473-
"ruby/ql/lib/codeql/ruby/regexp/SuperlinearBackTracking.qll"
473+
"ruby/ql/lib/codeql/ruby/security/performance/SuperlinearBackTracking.qll"
474474
],
475-
"BadTagFilterQuery Python/JS": [
475+
"BadTagFilterQuery Python/JS/Ruby": [
476476
"javascript/ql/lib/semmle/javascript/security/BadTagFilterQuery.qll",
477-
"python/ql/lib/semmle/python/security/BadTagFilterQuery.qll"
477+
"python/ql/lib/semmle/python/security/BadTagFilterQuery.qll",
478+
"ruby/ql/lib/codeql/ruby/security/BadTagFilterQuery.qll"
478479
],
479480
"CFG": [
480481
"csharp/ql/lib/semmle/code/csharp/controlflow/internal/ControlFlowGraphImplShared.qll",
Lines changed: 306 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,306 @@
1+
/**
2+
* Provides precicates for reasoning about bad tag filter vulnerabilities.
3+
*/
4+
5+
import performance.ReDoSUtil
6+
7+
/**
8+
* A module for determining if a regexp matches a given string,
9+
* and reasoning about which capture groups are filled by a given string.
10+
*/
11+
private module RegexpMatching {
12+
/**
13+
* A class to test whether a regular expression matches a string.
14+
* Override this class and extend `test`/`testWithGroups` to configure which strings should be tested for acceptance by this regular expression.
15+
* The result can afterwards be read from the `matches` predicate.
16+
*
17+
* Strings in the `testWithGroups` predicate are also tested for which capture groups are filled by the given string.
18+
* The result is available in the `fillCaptureGroup` predicate.
19+
*/
20+
abstract class MatchedRegExp extends RegExpTerm {
21+
MatchedRegExp() { this.isRootTerm() }
22+
23+
/**
24+
* Holds if it should be tested whether this regular expression matches `str`.
25+
*
26+
* If `ignorePrefix` is true, then a regexp without a start anchor will be treated as if it had a start anchor.
27+
* E.g. a regular expression `/foo$/` will match any string that ends with "foo",
28+
* but if `ignorePrefix` is true, it will only match "foo".
29+
*/
30+
predicate test(string str, boolean ignorePrefix) {
31+
none() // maybe overriden in subclasses
32+
}
33+
34+
/**
35+
* Same as `test(..)`, but where the `fillsCaptureGroup` afterwards tells which capture groups were filled by the given string.
36+
*/
37+
predicate testWithGroups(string str, boolean ignorePrefix) {
38+
none() // maybe overriden in subclasses
39+
}
40+
41+
/**
42+
* Holds if this RegExp matches `str`, where `str` is either in the `test` or `testWithGroups` predicate.
43+
*/
44+
final predicate matches(string str) {
45+
exists(State state | state = getAState(this, str.length() - 1, str, _) |
46+
epsilonSucc*(state) = Accept(_)
47+
)
48+
}
49+
50+
/**
51+
* Holds if matching `str` may fill capture group number `g`.
52+
* Only holds if `str` is in the `testWithGroups` predicate.
53+
*/
54+
final predicate fillsCaptureGroup(string str, int g) {
55+
exists(State s |
56+
s = getAStateThatReachesAccept(this, _, str, _) and
57+
g = group(s.getRepr())
58+
)
59+
}
60+
}
61+
62+
/**
63+
* Gets a state the regular expression `reg` can be in after matching the `i`th char in `str`.
64+
* The regular expression is modelled as a non-determistic finite automaton,
65+
* the regular expression can therefore be in multiple states after matching a character.
66+
*
67+
* It's a forward search to all possible states, and there is thus no guarantee that the state is on a path to an accepting state.
68+
*/
69+
private State getAState(MatchedRegExp reg, int i, string str, boolean ignorePrefix) {
70+
// start state, the -1 position before any chars have been matched
71+
i = -1 and
72+
(
73+
reg.test(str, ignorePrefix)
74+
or
75+
reg.testWithGroups(str, ignorePrefix)
76+
) and
77+
result.getRepr().getRootTerm() = reg and
78+
isStartState(result)
79+
or
80+
// recursive case
81+
result = getAStateAfterMatching(reg, _, str, i, _, ignorePrefix)
82+
}
83+
84+
/**
85+
* Gets the next state after the `prev` state from `reg`.
86+
* `prev` is the state after matching `fromIndex` chars in `str`,
87+
* and the result is the state after matching `toIndex` chars in `str`.
88+
*
89+
* This predicate is used as a step relation in the forwards search (`getAState`),
90+
* and also as a step relation in the later backwards search (`getAStateThatReachesAccept`).
91+
*/
92+
private State getAStateAfterMatching(
93+
MatchedRegExp reg, State prev, string str, int toIndex, int fromIndex, boolean ignorePrefix
94+
) {
95+
// the basic recursive case - outlined into a noopt helper to make performance work out.
96+
result = getAStateAfterMatchingAux(reg, prev, str, toIndex, fromIndex, ignorePrefix)
97+
or
98+
// we can skip past word boundaries if the next char is a non-word char.
99+
fromIndex = toIndex and
100+
prev.getRepr() instanceof RegExpWordBoundary and
101+
prev = getAState(reg, toIndex, str, ignorePrefix) and
102+
after(prev.getRepr()) = result and
103+
str.charAt(toIndex + 1).regexpMatch("\\W") // \W matches any non-word char.
104+
}
105+
106+
pragma[noopt]
107+
private State getAStateAfterMatchingAux(
108+
MatchedRegExp reg, State prev, string str, int toIndex, int fromIndex, boolean ignorePrefix
109+
) {
110+
prev = getAState(reg, fromIndex, str, ignorePrefix) and
111+
fromIndex = toIndex - 1 and
112+
exists(string char | char = str.charAt(toIndex) | specializedDeltaClosed(prev, char, result)) and
113+
not discardedPrefixStep(prev, result, ignorePrefix)
114+
}
115+
116+
/** Holds if a step from `prev` to `next` should be discarded when the `ignorePrefix` flag is set. */
117+
private predicate discardedPrefixStep(State prev, State next, boolean ignorePrefix) {
118+
prev = mkMatch(any(RegExpRoot r)) and
119+
ignorePrefix = true and
120+
next = prev
121+
}
122+
123+
// The `deltaClosed` relation specialized to the chars that exists in strings tested by a `MatchedRegExp`.
124+
private predicate specializedDeltaClosed(State prev, string char, State next) {
125+
deltaClosed(prev, specializedGetAnInputSymbolMatching(char), next)
126+
}
127+
128+
// The `getAnInputSymbolMatching` relation specialized to the chars that exists in strings tested by a `MatchedRegExp`.
129+
pragma[noinline]
130+
private InputSymbol specializedGetAnInputSymbolMatching(string char) {
131+
exists(string s, MatchedRegExp r |
132+
r.test(s, _)
133+
or
134+
r.testWithGroups(s, _)
135+
|
136+
char = s.charAt(_)
137+
) and
138+
result = getAnInputSymbolMatching(char)
139+
}
140+
141+
/**
142+
* Gets the `i`th state on a path to the accepting state when `reg` matches `str`.
143+
* Starts with an accepting state as found by `getAState` and searches backwards
144+
* to the start state through the reachable states (as found by `getAState`).
145+
*
146+
* This predicate holds the invariant that the result state can be reached with `i` steps from a start state,
147+
* and an accepting state can be found after (`str.length() - 1 - i`) steps from the result.
148+
* The result state is therefore always on a valid path where `reg` accepts `str`.
149+
*
150+
* This predicate is only used to find which capture groups a regular expression has filled,
151+
* and thus the search is only performed for the strings in the `testWithGroups(..)` predicate.
152+
*/
153+
private State getAStateThatReachesAccept(
154+
MatchedRegExp reg, int i, string str, boolean ignorePrefix
155+
) {
156+
// base case, reaches an accepting state from the last state in `getAState(..)`
157+
reg.testWithGroups(str, ignorePrefix) and
158+
i = str.length() - 1 and
159+
result = getAState(reg, i, str, ignorePrefix) and
160+
epsilonSucc*(result) = Accept(_)
161+
or
162+
// recursive case. `next` is the next state to be matched after matching `prev`.
163+
// this predicate is doing a backwards search, so `prev` is the result we are looking for.
164+
exists(State next, State prev, int fromIndex, int toIndex |
165+
next = getAStateThatReachesAccept(reg, toIndex, str, ignorePrefix) and
166+
next = getAStateAfterMatching(reg, prev, str, toIndex, fromIndex, ignorePrefix) and
167+
i = fromIndex and
168+
result = prev
169+
)
170+
}
171+
172+
/** Gets the capture group number that `term` belongs to. */
173+
private int group(RegExpTerm term) {
174+
exists(RegExpGroup grp | grp.getNumber() = result | term.getParent*() = grp)
175+
}
176+
}
177+
178+
/** A class to test whether a regular expression matches certain HTML tags. */
179+
class HTMLMatchingRegExp extends RegexpMatching::MatchedRegExp {
180+
HTMLMatchingRegExp() {
181+
// the regexp must mention "<" and ">" explicitly.
182+
forall(string angleBracket | angleBracket = ["<", ">"] |
183+
any(RegExpConstant term | term.getValue().matches("%" + angleBracket + "%")).getRootTerm() =
184+
this
185+
)
186+
}
187+
188+
override predicate testWithGroups(string str, boolean ignorePrefix) {
189+
ignorePrefix = true and
190+
str = ["<!-- foo -->", "<!-- foo --!>", "<!- foo ->", "<foo>", "<script>"]
191+
}
192+
193+
override predicate test(string str, boolean ignorePrefix) {
194+
ignorePrefix = true and
195+
str =
196+
[
197+
"<!-- foo -->", "<!- foo ->", "<!-- foo --!>", "<!-- foo\n -->", "<script>foo</script>",
198+
"<script \n>foo</script>", "<script >foo\n</script>", "<foo ></foo>", "<foo>",
199+
"<foo src=\"foo\"></foo>", "<script>", "<script src=\"foo\"></script>",
200+
"<script src='foo'></script>", "<SCRIPT>foo</SCRIPT>", "<script\tsrc=\"foo\"/>",
201+
"<script\tsrc='foo'></script>", "<sCrIpT>foo</ScRiPt>", "<script src=\"foo\">foo</script >",
202+
"<script src=\"foo\">foo</script foo=\"bar\">", "<script src=\"foo\">foo</script\t\n bar>"
203+
]
204+
}
205+
}
206+
207+
/**
208+
* Holds if `regexp` matches some HTML tags, but misses some HTML tags that it should match.
209+
*
210+
* When adding a new case to this predicate, make sure the test string used in `matches(..)` calls are present in `HTMLMatchingRegExp::test` / `HTMLMatchingRegExp::testWithGroups`.
211+
*/
212+
predicate isBadRegexpFilter(HTMLMatchingRegExp regexp, string msg) {
213+
// CVE-2021-33829 - matching both "<!-- foo -->" and "<!-- foo --!>", but in different capture groups
214+
regexp.matches("<!-- foo -->") and
215+
regexp.matches("<!-- foo --!>") and
216+
exists(int a, int b | a != b |
217+
regexp.fillsCaptureGroup("<!-- foo -->", a) and
218+
// <!-- foo --> might be ambigously parsed (matching both capture groups), and that is ok here.
219+
regexp.fillsCaptureGroup("<!-- foo --!>", b) and
220+
not regexp.fillsCaptureGroup("<!-- foo --!>", a) and
221+
msg =
222+
"Comments ending with --> are matched differently from comments ending with --!>. The first is matched with capture group "
223+
+ a + " and comments ending with --!> are matched with capture group " +
224+
strictconcat(int i | regexp.fillsCaptureGroup("<!-- foo --!>", i) | i.toString(), ", ") +
225+
"."
226+
)
227+
or
228+
// CVE-2020-17480 - matching "<!-- foo -->" and other tags, but not "<!-- foo --!>".
229+
exists(int group, int other |
230+
group != other and
231+
regexp.fillsCaptureGroup("<!-- foo -->", group) and
232+
regexp.fillsCaptureGroup("<foo>", other) and
233+
not regexp.matches("<!-- foo --!>") and
234+
not regexp.fillsCaptureGroup("<!-- foo -->", any(int i | i != group)) and
235+
not regexp.fillsCaptureGroup("<!- foo ->", group) and
236+
not regexp.fillsCaptureGroup("<foo>", group) and
237+
not regexp.fillsCaptureGroup("<script>", group) and
238+
msg =
239+
"This regular expression only parses --> (capture group " + group +
240+
") and not --!> as a HTML comment end tag."
241+
)
242+
or
243+
regexp.matches("<!-- foo -->") and
244+
not regexp.matches("<!-- foo\n -->") and
245+
not regexp.matches("<!- foo ->") and
246+
not regexp.matches("<foo>") and
247+
not regexp.matches("<script>") and
248+
msg = "This regular expression does not match comments containing newlines."
249+
or
250+
regexp.matches("<script>foo</script>") and
251+
regexp.matches("<script src=\"foo\"></script>") and
252+
not regexp.matches("<foo ></foo>") and
253+
(
254+
not regexp.matches("<script \n>foo</script>") and
255+
msg = "This regular expression matches <script></script>, but not <script \\n></script>"
256+
or
257+
not regexp.matches("<script >foo\n</script>") and
258+
msg = "This regular expression matches <script>...</script>, but not <script >...\\n</script>"
259+
)
260+
or
261+
regexp.matches("<script>foo</script>") and
262+
regexp.matches("<script src=\"foo\"></script>") and
263+
not regexp.matches("<script src='foo'></script>") and
264+
not regexp.matches("<foo>") and
265+
msg = "This regular expression does not match script tags where the attribute uses single-quotes."
266+
or
267+
regexp.matches("<script>foo</script>") and
268+
regexp.matches("<script src='foo'></script>") and
269+
not regexp.matches("<script src=\"foo\"></script>") and
270+
not regexp.matches("<foo>") and
271+
msg = "This regular expression does not match script tags where the attribute uses double-quotes."
272+
or
273+
regexp.matches("<script>foo</script>") and
274+
regexp.matches("<script src='foo'></script>") and
275+
not regexp.matches("<script\tsrc='foo'></script>") and
276+
not regexp.matches("<foo>") and
277+
not regexp.matches("<foo src=\"foo\"></foo>") and
278+
msg = "This regular expression does not match script tags where tabs are used between attributes."
279+
or
280+
regexp.matches("<script>foo</script>") and
281+
not RegExpFlags::isIgnoreCase(regexp) and
282+
not regexp.matches("<foo>") and
283+
not regexp.matches("<foo ></foo>") and
284+
(
285+
not regexp.matches("<SCRIPT>foo</SCRIPT>") and
286+
msg = "This regular expression does not match upper case <SCRIPT> tags."
287+
or
288+
not regexp.matches("<sCrIpT>foo</ScRiPt>") and
289+
regexp.matches("<SCRIPT>foo</SCRIPT>") and
290+
msg = "This regular expression does not match mixed case <sCrIpT> tags."
291+
)
292+
or
293+
regexp.matches("<script src=\"foo\"></script>") and
294+
not regexp.matches("<foo>") and
295+
not regexp.matches("<foo ></foo>") and
296+
(
297+
not regexp.matches("<script src=\"foo\">foo</script >") and
298+
msg = "This regular expression does not match script end tags like </script >."
299+
or
300+
not regexp.matches("<script src=\"foo\">foo</script foo=\"bar\">") and
301+
msg = "This regular expression does not match script end tags like </script foo=\"bar\">."
302+
or
303+
not regexp.matches("<script src=\"foo\">foo</script\t\n bar>") and
304+
msg = "This regular expression does not match script end tags like </script\\t\\n bar>."
305+
)
306+
}

ruby/ql/lib/codeql/ruby/security/performance/RegExpTreeView.qll

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -438,6 +438,13 @@ private int toHex(string hex) {
438438
result = 15 and hex = ["f", "F"]
439439
}
440440

441+
/**
442+
* A word boundary, that is, a regular expression term of the form `\b`.
443+
*/
444+
class RegExpWordBoundary extends RegExpEscape {
445+
RegExpWordBoundary() { this.getUnescaped() = "b" }
446+
}
447+
441448
/**
442449
* A character class escape in a regular expression.
443450
* That is, an escaped character that denotes multiple characters.
Lines changed: 54 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,54 @@
1+
<!DOCTYPE qhelp PUBLIC
2+
"-//Semmle//qhelp//EN"
3+
"qhelp.dtd">
4+
<qhelp>
5+
6+
<overview>
7+
<p>
8+
It is possible to match some single HTML tags using regular expressions (parsing general HTML using
9+
regular expressions is impossible). However, if the regular expression is not written well it might
10+
be possible to circumvent it, which can lead to cross-site scripting or other security issues.
11+
</p>
12+
<p>
13+
Some of these mistakes are caused by browsers having very forgiving HTML parsers, and
14+
will often render invalid HTML containing syntax errors.
15+
Regular expressions that attempt to match HTML should also recognize tags containing such syntax errors.
16+
</p>
17+
</overview>
18+
19+
<recommendation>
20+
<p>
21+
Use a well-tested sanitization or parser library if at all possible. These libraries are much more
22+
likely to handle corner cases correctly than a custom implementation.
23+
</p>
24+
</recommendation>
25+
26+
<example>
27+
<p>
28+
The following example attempts to filters out all <code>&lt;script&gt;</code> tags.
29+
</p>
30+
31+
<sample src="examples/BadTagFilter.rb" />
32+
33+
<p>
34+
The above sanitizer does not filter out all <code>&lt;script&gt;</code> tags.
35+
Browsers will not only accept <code>&lt;/script&gt;</code> as script end tags, but also tags such as <code>&lt;/script foo="bar"&gt;</code> even though it is a parser error.
36+
This means that an attack string such as <code>&lt;script&gt;alert(1)&lt;/script foo="bar"&gt;</code> will not be filtered by
37+
the function, and <code>alert(1)</code> will be executed by a browser if the string is rendered as HTML.
38+
</p>
39+
40+
<p>
41+
Other corner cases include that HTML comments can end with <code>--!&gt;</code>,
42+
and that HTML tag names can contain upper case characters.
43+
</p>
44+
</example>
45+
46+
<references>
47+
<li>Securitum: <a href="https://research.securitum.com/the-curious-case-of-copy-paste/">The Curious Case of Copy &amp; Paste</a>.</li>
48+
<li>stackoverflow.com: <a href="https://stackoverflow.com/questions/1732348/regex-match-open-tags-except-xhtml-self-contained-tags#answer-1732454">You can't parse [X]HTML with regex</a>.</li>
49+
<li>HTML Standard: <a href="https://html.spec.whatwg.org/multipage/parsing.html#comment-end-bang-state">Comment end bang state</a>.</li>
50+
<li>stackoverflow.com: <a href="https://stackoverflow.com/questions/25559999/why-arent-browsers-strict-about-html">Why aren't browsers strict about HTML?</a>.</li>
51+
</references>
52+
</qhelp>
53+
54+

0 commit comments

Comments
 (0)