Skip to content

Commit 2feb4a4

Browse files
committed
Ruby: Add hasMisleadingAnchorPrecedence to MissingRegExpAnchor
1 parent 3f8b27c commit 2feb4a4

File tree

4 files changed

+215
-21
lines changed

4 files changed

+215
-21
lines changed

ruby/ql/src/queries/security/cwe-020/MissingRegExpAnchor.qhelp

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -65,8 +65,6 @@
6565
<code>evil.beta.example.com</code> because the regular expression is parsed
6666
as <code>/(^www\.example\.com)|(beta\.example\.com)/</code>
6767

68-
TODO: implement this part of the query
69-
7068
</p>
7169

7270
<p>

ruby/ql/src/queries/security/cwe-020/MissingRegExpAnchor.ql

Lines changed: 151 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -34,12 +34,158 @@ predicate isFinalRegExpTerm(RegExpTerm term) {
3434
)
3535
}
3636

37+
/** Holds if `term` is one of the transitive left children of a regexp. */
38+
predicate isLeftArmTerm(RegExpTerm term) {
39+
term.isRootTerm()
40+
or
41+
exists(RegExpTerm parent |
42+
term = parent.getChild(0) and
43+
isLeftArmTerm(parent)
44+
)
45+
}
46+
47+
/** Holds if `term` is one of the transitive right children of a regexp. */
48+
predicate isRightArmTerm(RegExpTerm term) {
49+
term.isRootTerm()
50+
or
51+
exists(RegExpTerm parent |
52+
term = parent.getLastChild() and
53+
isRightArmTerm(parent)
54+
)
55+
}
56+
57+
/**
58+
* Holds if `term` is an anchor that is not the first or last node
59+
* in its tree.
60+
*/
61+
predicate isInteriorAnchor(RegExpAnchor term) {
62+
not isLeftArmTerm(term) and
63+
not isRightArmTerm(term)
64+
}
65+
66+
/**
67+
* Holds if `term` contains an anchor that is not the first or last node
68+
* in its tree, such as `(foo|bar$|baz)`.
69+
*/
70+
predicate containsInteriorAnchor(RegExpTerm term) { isInteriorAnchor(term.getAChild*()) }
71+
72+
/**
73+
* Holds if `term` starts with a word boundary or lookbehind assertion,
74+
* indicating that it's not intended to be anchored on that side.
75+
*/
76+
predicate containsLeadingPseudoAnchor(RegExpSequence term) {
77+
exists(RegExpTerm child | child = term.getChild(0) |
78+
child instanceof RegExpWordBoundary or
79+
child instanceof RegExpNonWordBoundary or
80+
child instanceof RegExpLookbehind
81+
)
82+
}
83+
84+
/**
85+
* Holds if `term` ends with a word boundary or lookahead assertion,
86+
* indicating that it's not intended to be anchored on that side.
87+
*/
88+
predicate containsTrailingPseudoAnchor(RegExpSequence term) {
89+
exists(RegExpTerm child | child = term.getLastChild() |
90+
child instanceof RegExpWordBoundary or
91+
child instanceof RegExpNonWordBoundary or
92+
child instanceof RegExpLookahead
93+
)
94+
}
95+
96+
/**
97+
* Holds if `term` is an empty sequence, usually arising from
98+
* literals with a trailing alternative such as `foo|`.
99+
*/
100+
predicate isEmpty(RegExpSequence term) { term.getNumChild() = 0 }
101+
102+
/**
103+
* Holds if `term` contains a letter constant.
104+
*
105+
* We use this as a heuristic to filter out uninteresting results.
106+
*/
107+
predicate containsLetters(RegExpTerm term) {
108+
term.getAChild*().(RegExpConstant).getValue().regexpMatch(".*[a-zA-Z].*")
109+
}
110+
111+
/**
112+
* Holds if `alt` has an explicitly anchored group, such as `^(foo|bar)|baz`
113+
* and doesn't have any unnecessary groups, such as in `^(foo)|(bar)`.
114+
*/
115+
predicate hasExplicitAnchorPrecedence(RegExpAlt alt) {
116+
isAnchoredGroup(alt.getAChild()) and
117+
not alt.getAChild() instanceof RegExpGroup
118+
}
119+
120+
/**
121+
* Holds if `term` consists only of an anchor and a parenthesized term,
122+
* such as the left side of `^(foo|bar)|baz`.
123+
*
124+
* The precedence of the anchor is likely to be intentional in this case,
125+
* as the group wouldn't be needed otherwise.
126+
*/
127+
predicate isAnchoredGroup(RegExpSequence term) {
128+
term.getNumChild() = 2 and
129+
term.getAChild() instanceof RegExpAnchor and
130+
term.getAChild() instanceof RegExpGroup
131+
}
132+
133+
/**
134+
* Holds if `src` is a pattern for a collection of alternatives where
135+
* only the first or last alternative is anchored, indicating a
136+
* precedence mistake explained by `msg`.
137+
*
138+
* The canonical example of such a mistake is: `^a|b|c`, which is
139+
* parsed as `(^a)|(b)|(c)`.
140+
*/
141+
predicate hasMisleadingAnchorPrecedence(RegExpPatternSource src, string msg) {
142+
exists(RegExpAlt root, RegExpSequence anchoredTerm, string direction |
143+
root = src.getRegExpTerm() and
144+
not containsInteriorAnchor(root) and
145+
not isEmpty(root.getAChild()) and
146+
not hasExplicitAnchorPrecedence(root) and
147+
containsLetters(anchoredTerm) and
148+
(
149+
anchoredTerm = root.getChild(0) and
150+
anchoredTerm.getChild(0) instanceof RegExpCaret and
151+
not containsLeadingPseudoAnchor(root.getChild([1 .. root.getNumChild() - 1])) and
152+
containsLetters(root.getChild([1 .. root.getNumChild() - 1])) and
153+
direction = "beginning"
154+
or
155+
anchoredTerm = root.getLastChild() and
156+
anchoredTerm.getLastChild() instanceof RegExpDollar and
157+
not containsTrailingPseudoAnchor(root.getChild([0 .. root.getNumChild() - 2])) and
158+
containsLetters(root.getChild([0 .. root.getNumChild() - 2])) and
159+
direction = "end"
160+
) and
161+
// that is not used for string replacement
162+
not exists(DataFlow::CallNode mcn, DataFlow::Node arg, string name |
163+
name = mcn.getMethodName() and
164+
arg = mcn.getArgument(0)
165+
|
166+
(
167+
src.getAParse().(DataFlow::LocalSourceNode).flowsTo(arg) or
168+
src.getAParse() = arg
169+
) and
170+
name = ["sub", "sub!", "gsub", "gsub!"]
171+
) and
172+
msg =
173+
"Misleading operator precedence. The subexpression '" + anchoredTerm.getRawValue() +
174+
"' is anchored at the " + direction +
175+
", but the other parts of this regular expression are not"
176+
)
177+
}
178+
37179
/**
38180
* Holds if `src` contains a hostname pattern that uses the `^/$` line anchors
39181
* rather than `\A/\z` which match the start/end of the whole string.
40182
*/
41183
predicate isLineAnchoredHostnameRegExp(RegExpPatternSource src, string msg) {
42-
not isSemiAnchoredHostnameRegExp(src, msg) and // avoid double reporting
184+
// avoid double reporting
185+
not (
186+
isSemiAnchoredHostnameRegExp(src, _) or
187+
hasMisleadingAnchorPrecedence(src, _)
188+
) and
43189
exists(RegExpTerm term, RegExpSequence tld, int i | term = src.getRegExpTerm() |
44190
not isConstantInvalidInsideOrigin(term.getAChild*()) and
45191
tld = term.getAChild*() and
@@ -58,7 +204,7 @@ predicate isLineAnchoredHostnameRegExp(RegExpPatternSource src, string msg) {
58204
* Holds if `src` contains a hostname pattern that is missing a `$` anchor.
59205
*/
60206
predicate isSemiAnchoredHostnameRegExp(RegExpPatternSource src, string msg) {
61-
// not hasMisleadingAnchorPrecedence(src, _) and // avoid double reporting
207+
not hasMisleadingAnchorPrecedence(src, _) and // avoid double reporting
62208
exists(RegExpTerm term, RegExpSequence tld, int i | term = src.getRegExpTerm() |
63209
not isConstantInvalidInsideOrigin(term.getAChild*()) and
64210
tld = term.getAChild*() and
@@ -81,7 +227,7 @@ predicate isUnanchoredHostnameRegExp(RegExpPatternSource src, string msg) {
81227
hasTopLevelDomainEnding(tld) and
82228
not isConstantInvalidInsideOrigin(term.getAChild*()) and
83229
not term.getAChild*() instanceof RegExpAnchor and
84-
// that is not used for capture or replace
230+
// that is not used for string replacement
85231
not exists(DataFlow::CallNode mcn, DataFlow::Node arg, string name |
86232
name = mcn.getMethodName() and
87233
arg = mcn.getArgument(0)
@@ -101,5 +247,6 @@ from DataFlow::Node nd, string msg
101247
where
102248
isUnanchoredHostnameRegExp(nd, msg) or
103249
isSemiAnchoredHostnameRegExp(nd, msg) or
104-
isLineAnchoredHostnameRegExp(nd, msg)
250+
isLineAnchoredHostnameRegExp(nd, msg) or
251+
hasMisleadingAnchorPrecedence(nd, msg)
105252
select nd, msg
Lines changed: 18 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,18 @@
1-
| missing_regexp_anchor.rb:1:1:1:17 | /www.example.com/ | When this is used as a regular expression on a URL, it may match anywhere, and arbitrary hosts may come before or after it. |
2-
| missing_regexp_anchor.rb:7:1:7:21 | /https?:\\/\\/good.com/ | When this is used as a regular expression on a URL, it may match anywhere, and arbitrary hosts may come before or after it. |
3-
| missing_regexp_anchor.rb:8:1:8:22 | /^https?:\\/\\/good.com/ | This hostname pattern may match any domain name, as it is missing a '\\z' or '/' at the end. |
4-
| missing_regexp_anchor.rb:8:1:8:22 | /^https?:\\/\\/good.com/ | This hostname pattern uses anchors such as '^' and '$', which match the start and end of a line, not the whole string. Use '\\A' and '\\z' instead. |
1+
| missing_regexp_anchor.rb:1:1:1:19 | /www\\.example\\.com/ | When this is used as a regular expression on a URL, it may match anywhere, and arbitrary hosts may come before or after it. |
2+
| missing_regexp_anchor.rb:7:1:7:22 | /https?:\\/\\/good\\.com/ | When this is used as a regular expression on a URL, it may match anywhere, and arbitrary hosts may come before or after it. |
3+
| missing_regexp_anchor.rb:8:1:8:23 | /^https?:\\/\\/good\\.com/ | This hostname pattern may match any domain name, as it is missing a '\\z' or '/' at the end. |
4+
| missing_regexp_anchor.rb:19:1:19:6 | /^a\|b/ | Misleading operator precedence. The subexpression '^a' is anchored at the beginning, but the other parts of this regular expression are not |
5+
| missing_regexp_anchor.rb:22:1:22:8 | /^a\|b\|c/ | Misleading operator precedence. The subexpression '^a' is anchored at the beginning, but the other parts of this regular expression are not |
6+
| missing_regexp_anchor.rb:28:1:28:8 | /^a\|(b)/ | Misleading operator precedence. The subexpression '^a' is anchored at the beginning, but the other parts of this regular expression are not |
7+
| missing_regexp_anchor.rb:30:1:30:10 | /^(a)\|(b)/ | Misleading operator precedence. The subexpression '^(a)' is anchored at the beginning, but the other parts of this regular expression are not |
8+
| missing_regexp_anchor.rb:33:1:33:6 | /a\|b$/ | Misleading operator precedence. The subexpression 'b$' is anchored at the end, but the other parts of this regular expression are not |
9+
| missing_regexp_anchor.rb:36:1:36:8 | /a\|b\|c$/ | Misleading operator precedence. The subexpression 'c$' is anchored at the end, but the other parts of this regular expression are not |
10+
| missing_regexp_anchor.rb:42:1:42:8 | /(a)\|b$/ | Misleading operator precedence. The subexpression 'b$' is anchored at the end, but the other parts of this regular expression are not |
11+
| missing_regexp_anchor.rb:44:1:44:10 | /(a)\|(b)$/ | Misleading operator precedence. The subexpression '(b)$' is anchored at the end, but the other parts of this regular expression are not |
12+
| missing_regexp_anchor.rb:46:1:46:22 | /^good.com\|better.com/ | Misleading operator precedence. The subexpression '^good.com' is anchored at the beginning, but the other parts of this regular expression are not |
13+
| missing_regexp_anchor.rb:47:1:47:24 | /^good\\.com\|better\\.com/ | Misleading operator precedence. The subexpression '^good\\.com' is anchored at the beginning, but the other parts of this regular expression are not |
14+
| missing_regexp_anchor.rb:48:1:48:26 | /^good\\\\.com\|better\\\\.com/ | Misleading operator precedence. The subexpression '^good\\\\.com' is anchored at the beginning, but the other parts of this regular expression are not |
15+
| missing_regexp_anchor.rb:49:1:49:28 | /^good\\\\\\.com\|better\\\\\\.com/ | Misleading operator precedence. The subexpression '^good\\\\\\.com' is anchored at the beginning, but the other parts of this regular expression are not |
16+
| missing_regexp_anchor.rb:50:1:50:30 | /^good\\\\\\\\.com\|better\\\\\\\\.com/ | Misleading operator precedence. The subexpression '^good\\\\\\\\.com' is anchored at the beginning, but the other parts of this regular expression are not |
17+
| missing_regexp_anchor.rb:52:1:52:15 | /^foo\|bar\|baz$/ | Misleading operator precedence. The subexpression '^foo' is anchored at the beginning, but the other parts of this regular expression are not |
18+
| missing_regexp_anchor.rb:52:1:52:15 | /^foo\|bar\|baz$/ | Misleading operator precedence. The subexpression 'baz$' is anchored at the end, but the other parts of this regular expression are not |
Lines changed: 46 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -1,18 +1,53 @@
1-
/www.example.com/ # BAD
2-
/^www.example.com$/ # BAD: uses end-of-line anchors rather than end-of-string anchors
3-
/\Awww.example.com\z/ # GOOD
1+
/www\.example\.com/ # BAD
2+
/^www\.example\.com$/ # BAD: uses end-of-line anchors rather than end-of-string anchors
3+
/\Awww\.example\.com\z/ # GOOD
44

5-
/foo.bar/ # GOOD
5+
/foo\.bar/ # GOOD
66

7-
/https?:\/\/good.com/ # BAD
8-
/^https?:\/\/good.com/ # BAD: missing end-of-string anchor
9-
/(^https?:\/\/good1.com)|(^https?://good2.com)/ # BAD: missing end-of-string anchor
7+
/https?:\/\/good\.com/ # BAD
8+
/^https?:\/\/good\.com/ # BAD: missing end-of-string anchor
9+
/(^https?:\/\/good1\.com)|(^https?:#good2\.com)/ # BAD: missing end-of-string anchor
1010

1111
/bar/ # GOOD
1212

13-
foo.gsub(/www.example.com/, "bar") # GOOD
14-
foo.sub(/www.example.com/, "bar") # GOOD
15-
foo.gsub!(/www.example.com/, "bar") # GOOD
16-
foo.sub!(/www.example.com/, "bar") # GOOD
13+
foo.gsub(/www\.example\.com/, "bar") # GOOD
14+
foo.sub(/www\.example.com/, "bar") # GOOD
15+
foo.gsub!(/www\.example\.com/, "bar") # GOOD
16+
foo.sub!(/www\.example\.com/, "bar") # GOOD
1717

18+
/^a|/
19+
/^a|b/ # BAD
20+
/a|^b/
21+
/^a|^b/
22+
/^a|b|c/ # BAD
23+
/a|^b|c/
24+
/a|b|^c/
25+
/^a|^b|c/
1826

27+
/(^a)|b/
28+
/^a|(b)/ # BAD
29+
/^a|(^b)/
30+
/^(a)|(b)/ # BAD
31+
32+
33+
/a|b$/ # BAD
34+
/a$|b/
35+
/a$|b$/
36+
/a|b|c$/ # BAD
37+
/a|b$|c/
38+
/a$|b|c/
39+
/a|b$|c$/
40+
41+
/a|(b$)/
42+
/(a)|b$/ # BAD
43+
/(a$)|b$/
44+
/(a)|(b)$/ # BAD
45+
46+
/^good.com|better.com/ # BAD
47+
/^good\.com|better\.com/ # BAD
48+
/^good\\.com|better\\.com/ # BAD
49+
/^good\\\.com|better\\\.com/ # BAD
50+
/^good\\\\.com|better\\\\.com/ # BAD
51+
52+
/^foo|bar|baz$/ # BAD
53+
/^foo|%/ # OK

0 commit comments

Comments
 (0)