Skip to content

Commit 832c9c4

Browse files
committed
Ruby: copy IncompleteHostnameRegExp files from JavaScript
1 parent 602538d commit 832c9c4

File tree

8 files changed

+408
-0
lines changed

8 files changed

+408
-0
lines changed
Lines changed: 109 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,109 @@
1+
/**
2+
* Provides predicates for reasoning about regular expressions
3+
* that match URLs and hostname patterns.
4+
*/
5+
6+
import javascript
7+
8+
/**
9+
* Holds if the given constant is unlikely to occur in the origin part of a URL.
10+
*/
11+
predicate isConstantInvalidInsideOrigin(RegExpConstant term) {
12+
// Look for any of these cases:
13+
// - A character that can't occur in the origin
14+
// - Two dashes in a row
15+
// - A colon that is not part of port or scheme separator
16+
// - A slash that is not part of scheme separator
17+
term.getValue().regexpMatch(".*(?:[^a-zA-Z0-9.:/-]|--|:[^0-9/]|(?<![/:]|^)/).*")
18+
}
19+
20+
/** Holds if `term` is a dot constant of form `\.` or `[.]`. */
21+
predicate isDotConstant(RegExpTerm term) {
22+
term.(RegExpCharEscape).getValue() = "."
23+
or
24+
exists(RegExpCharacterClass cls |
25+
term = cls and
26+
not cls.isInverted() and
27+
cls.getNumChild() = 1 and
28+
cls.getAChild().(RegExpConstant).getValue() = "."
29+
)
30+
}
31+
32+
/** Holds if `term` is a wildcard `.` or an actual `.` character. */
33+
predicate isDotLike(RegExpTerm term) {
34+
term instanceof RegExpDot
35+
or
36+
isDotConstant(term)
37+
}
38+
39+
/** Holds if `term` will only ever be matched against the beginning of the input. */
40+
predicate matchesBeginningOfString(RegExpTerm term) {
41+
term.isRootTerm()
42+
or
43+
exists(RegExpTerm parent | matchesBeginningOfString(parent) |
44+
term = parent.(RegExpSequence).getChild(0)
45+
or
46+
parent.(RegExpSequence).getChild(0) instanceof RegExpCaret and
47+
term = parent.(RegExpSequence).getChild(1)
48+
or
49+
term = parent.(RegExpAlt).getAChild()
50+
or
51+
term = parent.(RegExpGroup).getAChild()
52+
)
53+
}
54+
55+
/**
56+
* Holds if the given sequence contains top-level domain preceded by a dot, such as `.com`,
57+
* excluding cases where this is at the very beginning of the regexp.
58+
*
59+
* `i` is bound to the index of the last child in the top-level domain part.
60+
*/
61+
predicate hasTopLevelDomainEnding(RegExpSequence seq, int i) {
62+
seq.getChild(i)
63+
.(RegExpConstant)
64+
.getValue()
65+
.regexpMatch("(?i)" + RegExpPatterns::commonTLD() + "(:\\d+)?([/?#].*)?") and
66+
isDotLike(seq.getChild(i - 1)) and
67+
not (i = 1 and matchesBeginningOfString(seq))
68+
}
69+
70+
/**
71+
* Holds if the given regular expression term contains top-level domain preceded by a dot,
72+
* such as `.com`.
73+
*/
74+
predicate hasTopLevelDomainEnding(RegExpSequence seq) { hasTopLevelDomainEnding(seq, _) }
75+
76+
/**
77+
* Holds if `term` will always match a hostname, that is, all disjunctions contain
78+
* a hostname pattern that isn't inside a quantifier.
79+
*/
80+
predicate alwaysMatchesHostname(RegExpTerm term) {
81+
hasTopLevelDomainEnding(term, _)
82+
or
83+
// `localhost` is considered a hostname pattern, but has no TLD
84+
term.(RegExpConstant).getValue().regexpMatch("\\blocalhost\\b")
85+
or
86+
not term instanceof RegExpAlt and
87+
not term instanceof RegExpQuantifier and
88+
alwaysMatchesHostname(term.getAChild())
89+
or
90+
alwaysMatchesHostnameAlt(term)
91+
}
92+
93+
/** Holds if every child of `alt` contains a hostname pattern. */
94+
predicate alwaysMatchesHostnameAlt(RegExpAlt alt) {
95+
alwaysMatchesHostnameAlt(alt, alt.getNumChild() - 1)
96+
}
97+
98+
/**
99+
* Holds if the first `i` children of `alt` contains a hostname pattern.
100+
*
101+
* This is used instead of `forall` to avoid materializing the set of alternatives
102+
* that don't contains hostnames, which is much larger.
103+
*/
104+
predicate alwaysMatchesHostnameAlt(RegExpAlt alt, int i) {
105+
alwaysMatchesHostname(alt.getChild(0)) and i = 0
106+
or
107+
alwaysMatchesHostnameAlt(alt, i - 1) and
108+
alwaysMatchesHostname(alt.getChild(i))
109+
}
Lines changed: 73 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,73 @@
1+
<!DOCTYPE qhelp PUBLIC
2+
"-//Semmle//qhelp//EN"
3+
"qhelp.dtd">
4+
<qhelp>
5+
6+
<overview>
7+
<p>
8+
9+
Sanitizing untrusted URLs is an important technique for
10+
preventing attacks such as request forgeries and malicious
11+
redirections. Often, this is done by checking that the host of a URL
12+
is in a set of allowed hosts.
13+
14+
</p>
15+
16+
<p>
17+
18+
If a regular expression implements such a check, it is
19+
easy to accidentally make the check too permissive by not escaping the
20+
<code>.</code> meta-characters appropriately.
21+
22+
Even if the check is not used in a security-critical
23+
context, the incomplete check may still cause undesirable behaviors
24+
when it accidentally succeeds.
25+
26+
</p>
27+
</overview>
28+
29+
<recommendation>
30+
<p>
31+
32+
Escape all meta-characters appropriately when constructing
33+
regular expressions for security checks, pay special attention to the
34+
<code>.</code> meta-character.
35+
36+
</p>
37+
</recommendation>
38+
39+
<example>
40+
41+
<p>
42+
43+
The following example code checks that a URL redirection
44+
will reach the <code>example.com</code> domain, or one of its
45+
subdomains.
46+
47+
</p>
48+
49+
<sample src="examples/IncompleteHostnameRegExp.js"/>
50+
51+
<p>
52+
53+
The check is however easy to bypass because the unescaped
54+
<code>.</code> allows for any character before
55+
<code>example.com</code>, effectively allowing the redirect to go to
56+
an attacker-controlled domain such as <code>wwwXexample.com</code>.
57+
58+
</p>
59+
<p>
60+
61+
Address this vulnerability by escaping <code>.</code>
62+
appropriately: <code>let regex = /((www|beta)\.)?example\.com/</code>.
63+
64+
</p>
65+
66+
</example>
67+
68+
<references>
69+
<li>MDN: <a href="https://developer.mozilla.org/en-US/docs/Web/JavaScript/Guide/Regular_Expressions">Regular Expressions</a></li>
70+
<li>OWASP: <a href="https://www.owasp.org/index.php/Server_Side_Request_Forgery">SSRF</a></li>
71+
<li>OWASP: <a href="https://cheatsheetseries.owasp.org/cheatsheets/Unvalidated_Redirects_and_Forwards_Cheat_Sheet.html">XSS Unvalidated Redirects and Forwards Cheat Sheet</a>.</li>
72+
</references>
73+
</qhelp>
Lines changed: 107 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,107 @@
1+
/**
2+
* @name Incomplete regular expression for hostnames
3+
* @description Matching a URL or hostname against a regular expression that contains an unescaped dot as part of the hostname might match more hostnames than expected.
4+
* @kind problem
5+
* @problem.severity warning
6+
* @security-severity 7.8
7+
* @precision high
8+
* @id js/incomplete-hostname-regexp
9+
* @tags correctness
10+
* security
11+
* external/cwe/cwe-020
12+
*/
13+
14+
import javascript
15+
import semmle.javascript.CharacterEscapes
16+
import HostnameRegexpShared
17+
18+
/**
19+
* Holds if `term` occurs inside a quantifier or alternative (and thus
20+
* can not be expected to correspond to a unique match), or as part of
21+
* a lookaround assertion (which are rarely used for capture groups).
22+
*/
23+
predicate isInsideChoiceOrSubPattern(RegExpTerm term) {
24+
exists(RegExpParent parent | parent = term.getParent() |
25+
parent instanceof RegExpAlt
26+
or
27+
parent instanceof RegExpQuantifier
28+
or
29+
parent instanceof RegExpSubPattern
30+
or
31+
isInsideChoiceOrSubPattern(parent)
32+
)
33+
}
34+
35+
/**
36+
* Holds if `group` is likely to be used as a capture group.
37+
*/
38+
predicate isLikelyCaptureGroup(RegExpGroup group) {
39+
group.isCapture() and
40+
not isInsideChoiceOrSubPattern(group)
41+
}
42+
43+
/**
44+
* Holds if `seq` contains two consecutive dots `..` or escaped dots.
45+
*
46+
* At least one of these dots is not intended to be a subdomain separator,
47+
* so we avoid flagging the pattern in this case.
48+
*/
49+
predicate hasConsecutiveDots(RegExpSequence seq) {
50+
exists(int i |
51+
isDotLike(seq.getChild(i)) and
52+
isDotLike(seq.getChild(i + 1))
53+
)
54+
}
55+
56+
predicate isIncompleteHostNameRegExpPattern(RegExpTerm regexp, RegExpSequence seq, string msg) {
57+
seq = regexp.getAChild*() and
58+
exists(RegExpDot unescapedDot, int i, string hostname |
59+
hasTopLevelDomainEnding(seq, i) and
60+
not isConstantInvalidInsideOrigin(seq.getChild([0 .. i - 1]).getAChild*()) and
61+
not isLikelyCaptureGroup(seq.getChild([i .. seq.getNumChild() - 1]).getAChild*()) and
62+
unescapedDot = seq.getChild([0 .. i - 1]).getAChild*() and
63+
unescapedDot != seq.getChild(i - 1) and // Should not be the '.' immediately before the TLD
64+
not hasConsecutiveDots(unescapedDot.getParent()) and
65+
hostname =
66+
seq.getChild(i - 2).getRawValue() + seq.getChild(i - 1).getRawValue() +
67+
seq.getChild(i).getRawValue()
68+
|
69+
if unescapedDot.getParent() instanceof RegExpQuantifier
70+
then
71+
// `.*\.example.com` can match `evil.com/?x=.example.com`
72+
//
73+
// This problem only occurs when the pattern is applied against a full URL, not just a hostname/origin.
74+
// We therefore check if the pattern includes a suffix after the TLD, such as `.*\.example.com/`.
75+
// Note that a post-anchored pattern (`.*\.example.com$`) will usually fail to match a full URL,
76+
// and patterns with neither a suffix nor an anchor fall under the purview of MissingRegExpAnchor.
77+
seq.getChild(0) instanceof RegExpCaret and
78+
not seq.getAChild() instanceof RegExpDollar and
79+
seq.getChild([i .. i + 1]).(RegExpConstant).getValue().regexpMatch(".*[/?#].*") and
80+
msg =
81+
"has an unrestricted wildcard '" + unescapedDot.getParent().(RegExpQuantifier).getRawValue()
82+
+ "' which may cause '" + hostname +
83+
"' to be matched anywhere in the URL, outside the hostname."
84+
else
85+
msg =
86+
"has an unescaped '.' before '" + hostname +
87+
"', so it might match more hosts than expected."
88+
)
89+
}
90+
91+
from
92+
RegExpPatternSource re, RegExpTerm regexp, RegExpSequence hostSequence, string msg, string kind,
93+
DataFlow::Node aux
94+
where
95+
regexp = re.getRegExpTerm() and
96+
isIncompleteHostNameRegExpPattern(regexp, hostSequence, msg) and
97+
(
98+
if re.getAParse() != re
99+
then (
100+
kind = "string, which is used as a regular expression $@," and
101+
aux = re.getAParse()
102+
) else (
103+
kind = "regular expression" and aux = re
104+
)
105+
) and
106+
not CharacterEscapes::hasALikelyRegExpPatternMistake(re)
107+
select hostSequence, "This " + kind + " " + msg, aux, "here"
Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,9 @@
1+
app.get('/some/path', function(req, res) {
2+
let url = req.param('url'),
3+
host = urlLib.parse(url).host;
4+
// BAD: the host of `url` may be controlled by an attacker
5+
let regex = /^((www|beta).)?example.com/;
6+
if (host.match(regex)) {
7+
res.redirect(url);
8+
}
9+
});
Lines changed: 26 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,26 @@
1+
| tst-IncompleteHostnameRegExp.js:3:3:3:28 | ^http:\\/\\/test.example.com | This regular expression has an unescaped '.' before 'example.com', so it might match more hosts than expected. | tst-IncompleteHostnameRegExp.js:3:2:3:29 | /^http: ... le.com/ | here |
2+
| tst-IncompleteHostnameRegExp.js:5:3:5:28 | ^http:\\/\\/test.example.net | This regular expression has an unescaped '.' before 'example.net', so it might match more hosts than expected. | tst-IncompleteHostnameRegExp.js:5:2:5:29 | /^http: ... le.net/ | here |
3+
| tst-IncompleteHostnameRegExp.js:6:3:6:42 | ^http:\\/\\/test.(example-a\|example-b).com | This regular expression has an unescaped '.' before '(example-a\|example-b).com', so it might match more hosts than expected. | tst-IncompleteHostnameRegExp.js:6:2:6:43 | /^http: ... b).com/ | here |
4+
| tst-IncompleteHostnameRegExp.js:7:3:7:30 | ^http:\\/\\/(.+).example.com\\/ | This regular expression has an unescaped '.' before 'example.com', so it might match more hosts than expected. | tst-IncompleteHostnameRegExp.js:7:2:7:31 | /^http: ... .com\\// | here |
5+
| tst-IncompleteHostnameRegExp.js:7:3:7:30 | ^http:\\/\\/(.+).example.com\\/ | This regular expression has an unrestricted wildcard '.+' which may cause 'example.com' to be matched anywhere in the URL, outside the hostname. | tst-IncompleteHostnameRegExp.js:7:2:7:31 | /^http: ... .com\\// | here |
6+
| tst-IncompleteHostnameRegExp.js:10:3:10:36 | ^http:\\/\\/test.example.com\\/(?:.*) | This regular expression has an unescaped '.' before 'example.com', so it might match more hosts than expected. | tst-IncompleteHostnameRegExp.js:10:2:10:37 | /^http: ... (?:.*)/ | here |
7+
| tst-IncompleteHostnameRegExp.js:11:14:11:37 | ^http://test.example.com | This regular expression has an unescaped '.' before 'example.com', so it might match more hosts than expected. | tst-IncompleteHostnameRegExp.js:11:13:11:38 | "^http: ... le.com" | here |
8+
| tst-IncompleteHostnameRegExp.js:12:15:12:38 | ^http://test.example.com | This regular expression has an unescaped '.' before 'example.com', so it might match more hosts than expected. | tst-IncompleteHostnameRegExp.js:12:14:12:39 | "^http: ... le.com" | here |
9+
| tst-IncompleteHostnameRegExp.js:15:23:15:46 | ^http://test.example.com | This string, which is used as a regular expression $@, has an unescaped '.' before 'example.com', so it might match more hosts than expected. | tst-IncompleteHostnameRegExp.js:15:13:15:50 | id(id(i ... com"))) | here |
10+
| tst-IncompleteHostnameRegExp.js:19:18:19:34 | ^test.example.com | This string, which is used as a regular expression $@, has an unescaped '.' before 'example.com', so it might match more hosts than expected. | tst-IncompleteHostnameRegExp.js:20:13:20:26 | `${hostname}$` | here |
11+
| tst-IncompleteHostnameRegExp.js:22:28:22:44 | test.example.com$ | This string, which is used as a regular expression $@, has an unescaped '.' before 'example.com', so it might match more hosts than expected. | tst-IncompleteHostnameRegExp.js:23:13:23:27 | domain.hostname | here |
12+
| tst-IncompleteHostnameRegExp.js:28:24:28:40 | test.example.com$ | This string, which is used as a regular expression $@, has an unescaped '.' before 'example.com', so it might match more hosts than expected. | tst-IncompleteHostnameRegExp.js:26:21:26:35 | domain.hostname | here |
13+
| tst-IncompleteHostnameRegExp.js:30:31:30:47 | test.example.com$ | This string, which is used as a regular expression $@, has an unescaped '.' before 'example.com', so it might match more hosts than expected. | tst-IncompleteHostnameRegExp.js:32:21:32:35 | domain.hostname | here |
14+
| tst-IncompleteHostnameRegExp.js:37:3:37:53 | ^(https?:)?\\/\\/((service\|www).)?example.com(?=$\|\\/) | This regular expression has an unescaped '.' before 'example.com', so it might match more hosts than expected. | tst-IncompleteHostnameRegExp.js:37:2:37:54 | /^(http ... =$\|\\/)/ | here |
15+
| tst-IncompleteHostnameRegExp.js:38:3:38:43 | ^(http\|https):\\/\\/www.example.com\\/p\\/f\\/ | This regular expression has an unescaped '.' before 'example.com', so it might match more hosts than expected. | tst-IncompleteHostnameRegExp.js:38:2:38:44 | /^(http ... p\\/f\\// | here |
16+
| tst-IncompleteHostnameRegExp.js:39:5:39:30 | http:\\/\\/sub.example.com\\/ | This regular expression has an unescaped '.' before 'example.com', so it might match more hosts than expected. | tst-IncompleteHostnameRegExp.js:39:2:39:33 | /^(http ... om\\/)/g | here |
17+
| tst-IncompleteHostnameRegExp.js:40:3:40:29 | ^https?:\\/\\/api.example.com | This regular expression has an unescaped '.' before 'example.com', so it might match more hosts than expected. | tst-IncompleteHostnameRegExp.js:40:2:40:30 | /^https ... le.com/ | here |
18+
| tst-IncompleteHostnameRegExp.js:41:42:41:48 | ^https?://.+\\.example\\.com/ | This regular expression has an unrestricted wildcard '.+' which may cause 'example\\.com/' to be matched anywhere in the URL, outside the hostname. | tst-IncompleteHostnameRegExp.js:41:13:41:71 | '^http: ... \\.com/' | here |
19+
| tst-IncompleteHostnameRegExp.js:43:3:43:32 | ^https:\\/\\/[a-z]*.example.com$ | This regular expression has an unescaped '.' before 'example.com', so it might match more hosts than expected. | tst-IncompleteHostnameRegExp.js:43:2:43:33 | /^https ... e.com$/ | here |
20+
| tst-IncompleteHostnameRegExp.js:44:32:44:45 | .+.example.net | This regular expression has an unescaped '.' before 'example.net', so it might match more hosts than expected. | tst-IncompleteHostnameRegExp.js:44:9:44:101 | '^proto ... ernal)' | here |
21+
| tst-IncompleteHostnameRegExp.js:44:47:44:62 | .+.example-a.com | This regular expression has an unescaped '.' before 'example-a.com', so it might match more hosts than expected. | tst-IncompleteHostnameRegExp.js:44:9:44:101 | '^proto ... ernal)' | here |
22+
| tst-IncompleteHostnameRegExp.js:44:64:44:79 | .+.example-b.com | This regular expression has an unescaped '.' before 'example-b.com', so it might match more hosts than expected. | tst-IncompleteHostnameRegExp.js:44:9:44:101 | '^proto ... ernal)' | here |
23+
| tst-IncompleteHostnameRegExp.js:48:42:48:47 | ^https?://.+.example\\.com/ | This regular expression has an unescaped '.' before 'example\\.com/', so it might match more hosts than expected. | tst-IncompleteHostnameRegExp.js:48:13:48:69 | '^http: ... \\.com/' | here |
24+
| tst-IncompleteHostnameRegExp.js:48:42:48:47 | ^https?://.+.example\\.com/ | This regular expression has an unrestricted wildcard '.+' which may cause 'example\\.com/' to be matched anywhere in the URL, outside the hostname. | tst-IncompleteHostnameRegExp.js:48:13:48:69 | '^http: ... \\.com/' | here |
25+
| tst-IncompleteHostnameRegExp.js:53:14:53:35 | test.example.com$ | This regular expression has an unescaped '.' before 'example.com', so it might match more hosts than expected. | tst-IncompleteHostnameRegExp.js:53:13:53:36 | 'test.' ... e.com$' | here |
26+
| tst-IncompleteHostnameRegExp.js:59:5:59:20 | foo.example\\.com | This regular expression has an unescaped '.' before 'example\\.com', so it might match more hosts than expected. | tst-IncompleteHostnameRegExp.js:59:2:59:32 | /^(foo. ... ever)$/ | here |
Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
Security/CWE-020/IncompleteHostnameRegExp.ql
Lines changed: 23 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,23 @@
1+
from flask import Flask, request, redirect
2+
import re
3+
4+
app = Flask(__name__)
5+
6+
UNSAFE_REGEX = re.compile("(www|beta).example.com/")
7+
SAFE_REGEX = re.compile(r"(www|beta)\.example\.com/")
8+
9+
@app.route('/some/path/bad')
10+
def unsafe(request):
11+
target = request.args.get('target', '')
12+
if UNSAFE_REGEX.match(target):
13+
return redirect(target)
14+
15+
@app.route('/some/path/good')
16+
def safe(request):
17+
target = request.args.get('target', '')
18+
if SAFE_REGEX.match(target):
19+
return redirect(target)
20+
21+
# FP reported in https://github.com/github/codeql/issues/3712
22+
# This does not define a regex (but could be used by other code to do so)
23+
escaped = re.escape("https://www.humblebundle.com/home/library")

0 commit comments

Comments
 (0)