Ruby: copy IncompleteHostnameRegExp files from JavaScript

aibaars · aibaars · commit 832c9c4b0bda · 2022-03-07T16:10:07.000+01:00
diff --git a/ruby/ql/src/queries/security/cwe-020/HostnameRegexpShared.qll b/ruby/ql/src/queries/security/cwe-020/HostnameRegexpShared.qll
@@ -0,0 +1,109 @@
+/**
+ * Provides predicates for reasoning about regular expressions
+ * that match URLs and hostname patterns.
+ */
+
+import javascript
+
+/**
+ * Holds if the value of `term` contains text that is unlikely to occur in the origin part of a URL.
+ */
+predicate isConstantInvalidInsideOrigin(RegExpConstant term) {
+  // The pattern is wrapped in `.*`, so any of these may occur anywhere in the value:
+  // - `[^a-zA-Z0-9.:/-]`: a character that can't occur in the origin
+  // - `--`: two dashes in a row
+  // - `:[^0-9/]`: a colon followed by something other than a digit (port) or slash (scheme separator)
+  // - `(?<![/:]|^)/`: a slash that is neither first in the value nor preceded by `/` or `:`
+  term.getValue().regexpMatch(".*(?:[^a-zA-Z0-9.:/-]|--|:[^0-9/]|(?<![/:]|^)/).*")
+}
+
+/** Holds if `term` is a dot constant, written either as `\.` or as `[.]`. */
+predicate isDotConstant(RegExpTerm term) {
+  // `[.]`: a non-inverted character class whose only member is the constant `.`
+  exists(RegExpCharacterClass dotClass | dotClass = term |
+    dotClass.getNumChild() = 1 and
+    dotClass.getAChild().(RegExpConstant).getValue() = "." and
+    not dotClass.isInverted()
+  )
+  or
+  term.(RegExpCharEscape).getValue() = "." // `\.`
+}
+
+/** Holds if `term` is either an actual `.` character (`\.` or `[.]`) or the `.` wildcard. */
+predicate isDotLike(RegExpTerm term) {
+  isDotConstant(term)
+  or
+  term instanceof RegExpDot
+}
+
+/** Holds if `term` will only ever be matched against the beginning of the input. */
+predicate matchesBeginningOfString(RegExpTerm term) {
+  term.isRootTerm() // base case: the root term itself
+  or
+  exists(RegExpTerm parent | matchesBeginningOfString(parent) |
+    term = parent.(RegExpSequence).getChild(0) // the first element of such a sequence
+    or
+    parent.(RegExpSequence).getChild(0) instanceof RegExpCaret and // the sequence opens with `^` ...
+    term = parent.(RegExpSequence).getChild(1) // ... so the element right after `^` qualifies too
+    or
+    term = parent.(RegExpAlt).getAChild() // any alternative of such an alternation
+    or
+    term = parent.(RegExpGroup).getAChild() // the contents of such a group
+  )
+}
+
+/**
+ * Holds if the given sequence contains a top-level domain preceded by a dot, such as `.com`,
+ * excluding cases where this is at the very beginning of the regexp.
+ *
+ * `i` is bound to the index of the constant child of `seq` that holds the top-level domain part.
+ */
+predicate hasTopLevelDomainEnding(RegExpSequence seq, int i) {
+  seq.getChild(i)
+      .(RegExpConstant)
+      .getValue()
+      .regexpMatch("(?i)" + RegExpPatterns::commonTLD() + "(:\\d+)?([/?#].*)?") and // TLD, then optional `:port` and `/`, `?` or `#` suffix
+  isDotLike(seq.getChild(i - 1)) and // the child right before it is `.`, `\.` or `[.]`
+  not (i = 1 and matchesBeginningOfString(seq)) // skip a dot + TLD that opens the regexp
+}
+
+/**
+ * Holds if `seq` has, at some child index, a top-level domain preceded by a dot,
+ * such as `.com`.
+ */
+predicate hasTopLevelDomainEnding(RegExpSequence seq) { exists(int i | hasTopLevelDomainEnding(seq, i)) }
+
+/**
+ * Holds if `term` will always match a hostname, that is, all disjunctions contain
+ * a hostname pattern that isn't inside a quantifier.
+ */
+predicate alwaysMatchesHostname(RegExpTerm term) {
+  hasTopLevelDomainEnding(term, _) // `term` is a sequence containing a dot followed by a TLD
+  or
+  // `localhost` is considered a hostname pattern, but has no TLD
+  term.(RegExpConstant).getValue().regexpMatch("\\blocalhost\\b") // NOTE(review): if `regexpMatch` must match the whole value, only the exact constant `localhost` qualifies - confirm
+  or
+  not term instanceof RegExpAlt and
+  not term instanceof RegExpQuantifier and
+  alwaysMatchesHostname(term.getAChild()) // some child of a non-alternation, non-quantifier term qualifies
+  or
+  alwaysMatchesHostnameAlt(term) // for an alternation, every alternative has to qualify
+}
+
+/** Holds if each alternative of `alt` contains a hostname pattern. */
+predicate alwaysMatchesHostnameAlt(RegExpAlt alt) {
+  exists(int last | last = alt.getNumChild() - 1 | alwaysMatchesHostnameAlt(alt, last))
+}
+
+/**
+ * Holds if each child of `alt` at an index from `0` through `i` contains a hostname pattern.
+ *
+ * This is used instead of `forall` to avoid materializing the set of alternatives
+ * that don't contain hostnames, which is much larger.
+ */
+predicate alwaysMatchesHostnameAlt(RegExpAlt alt, int i) {
+  alwaysMatchesHostname(alt.getChild(0)) and i = 0 // base case: the first alternative
+  or
+  alwaysMatchesHostnameAlt(alt, i - 1) and // inductive step: children `0` through `i - 1` qualify ...
+  alwaysMatchesHostname(alt.getChild(i)) // ... and so does child `i`
+}
diff --git a/ruby/ql/src/queries/security/cwe-020/IncompleteHostnameRegExp.qhelp b/ruby/ql/src/queries/security/cwe-020/IncompleteHostnameRegExp.qhelp
@@ -0,0 +1,73 @@
+<!DOCTYPE qhelp PUBLIC
+"-//Semmle//qhelp//EN"
+"qhelp.dtd">
+<qhelp>
+
+	<overview>
+		<p>
+
+			Sanitizing untrusted URLs is an important technique for
+			preventing attacks such as request forgeries and malicious
+			redirections. Often, this is done by checking that the host of a URL
+			is in a set of allowed hosts.
+
+		</p>
+
+		<p>
+
+			If a regular expression implements such a check, it is
+			easy to accidentally make the check too permissive by not escaping the
+			<code>.</code> meta-characters appropriately.
+
+			Even if the check is not used in a security-critical
+			context, the incomplete check may still cause undesirable behaviors
+			when it accidentally succeeds.
+
+		</p>
+	</overview>
+
+	<recommendation>
+		<p>
+
+			Escape all meta-characters appropriately when constructing
+			regular expressions for security checks, and pay special attention to the
+			<code>.</code> meta-character.
+
+		</p>
+	</recommendation>
+
+	<example>
+
+		<p>
+
+			The following example code checks that a URL redirection
+			will reach the <code>example.com</code> domain, or one of its
+			subdomains.
+
+		</p>
+
+		<sample src="examples/IncompleteHostnameRegExp.rb"/>
+
+		<p>
+
+			The check is, however, easy to bypass because the unescaped
+			<code>.</code> allows for any character before
+			<code>example.com</code>, effectively allowing the redirect to go to
+			an attacker-controlled domain such as <code>wwwXexample.com</code>.
+
+		</p>
+		<p>
+
+			Address this vulnerability by escaping <code>.</code>
+			appropriately: <code>let regex = /((www|beta)\.)?example\.com/</code>.
+
+		</p>
+
+	</example>
+
+	<references>
+		<li>MDN: <a href="https://developer.mozilla.org/en-US/docs/Web/JavaScript/Guide/Regular_Expressions">Regular Expressions</a></li>
+		<li>OWASP: <a href="https://www.owasp.org/index.php/Server_Side_Request_Forgery">SSRF</a></li>
+		<li>OWASP: <a href="https://cheatsheetseries.owasp.org/cheatsheets/Unvalidated_Redirects_and_Forwards_Cheat_Sheet.html">Unvalidated Redirects and Forwards Cheat Sheet</a></li>
+	</references>
+</qhelp>
diff --git a/ruby/ql/src/queries/security/cwe-020/IncompleteHostnameRegExp.ql b/ruby/ql/src/queries/security/cwe-020/IncompleteHostnameRegExp.ql
@@ -0,0 +1,107 @@
+/**
+ * @name Incomplete regular expression for hostnames
+ * @description Matching a URL or hostname against a regular expression that contains an unescaped dot as part of the hostname might match more hostnames than expected.
+ * @kind problem
+ * @problem.severity warning
+ * @security-severity 7.8
+ * @precision high
+ * @id rb/incomplete-hostname-regexp
+ * @tags correctness
+ *       security
+ *       external/cwe/cwe-020
+ */
+
+import javascript
+import semmle.javascript.CharacterEscapes
+import HostnameRegexpShared
+
+/**
+ * Holds if `term` occurs inside a quantifier or alternative (and thus
+ * cannot be expected to correspond to a unique match), or as part of
+ * a lookaround assertion (which are rarely used for capture groups).
+ */
+predicate isInsideChoiceOrSubPattern(RegExpTerm term) {
+  exists(RegExpParent parent | parent = term.getParent() |
+    parent instanceof RegExpAlt
+    or
+    parent instanceof RegExpQuantifier
+    or
+    parent instanceof RegExpSubPattern
+    or
+    isInsideChoiceOrSubPattern(parent) // recurse: any enclosing term may be in such a position
+  )
+}
+
+/**
+ * Holds if `group` captures and is not nested in a choice or sub-pattern, so it is likely used as a capture group.
+ */
+predicate isLikelyCaptureGroup(RegExpGroup group) {
+  not isInsideChoiceOrSubPattern(group) and
+  group.isCapture()
+}
+
+/**
+ * Holds if two adjacent children of `seq` are both dots, whether wildcard `.` or escaped.
+ *
+ * In that case at least one of the two dots is not meant as a subdomain separator,
+ * so such a pattern is not flagged.
+ */
+predicate hasConsecutiveDots(RegExpSequence seq) {
+  exists(int first, int second | second = first + 1 |
+    isDotLike(seq.getChild(second)) and
+    isDotLike(seq.getChild(first))
+  )
+}
+
+predicate isIncompleteHostNameRegExpPattern(RegExpTerm regexp, RegExpSequence seq, string msg) {
+  seq = regexp.getAChild*() and // `seq` is `regexp` itself or any term nested inside it
+  exists(RegExpDot unescapedDot, int i, string hostname |
+    hasTopLevelDomainEnding(seq, i) and // child `i` holds the TLD, child `i - 1` the dot before it
+    not isConstantInvalidInsideOrigin(seq.getChild([0 .. i - 1]).getAChild*()) and // nothing before the TLD rules out a URL origin
+    not isLikelyCaptureGroup(seq.getChild([i .. seq.getNumChild() - 1]).getAChild*()) and // no likely capture group from the TLD onwards
+    unescapedDot = seq.getChild([0 .. i - 1]).getAChild*() and // a `.` wildcard somewhere before the TLD
+    unescapedDot != seq.getChild(i - 1) and // Should not be the '.' immediately before the TLD
+    not hasConsecutiveDots(unescapedDot.getParent()) and
+    hostname = // raw source text of children `i - 2`, `i - 1` and `i`, used in the alert message
+      seq.getChild(i - 2).getRawValue() + seq.getChild(i - 1).getRawValue() +
+        seq.getChild(i).getRawValue()
+  |
+    if unescapedDot.getParent() instanceof RegExpQuantifier
+    then
+      // `.*\.example.com` can match `evil.com/?x=.example.com`
+      //
+      // This problem only occurs when the pattern is applied against a full URL, not just a hostname/origin.
+      // We therefore check if the pattern includes a suffix after the TLD, such as `.*\.example.com/`.
+      // Note that a post-anchored pattern (`.*\.example.com$`) will usually fail to match a full URL,
+      // and patterns with neither a suffix nor an anchor fall under the purview of MissingRegExpAnchor.
+      seq.getChild(0) instanceof RegExpCaret and // the sequence starts with `^`
+      not seq.getAChild() instanceof RegExpDollar and // and has no `$` among its children
+      seq.getChild([i .. i + 1]).(RegExpConstant).getValue().regexpMatch(".*[/?#].*") and // `/`, `?` or `#` in child `i` or `i + 1`
+      msg =
+        "has an unrestricted wildcard '" + unescapedDot.getParent().(RegExpQuantifier).getRawValue()
+          + "' which may cause '" + hostname +
+          "' to be matched anywhere in the URL, outside the hostname."
+    else
+      // the wildcard is not directly under a quantifier: report a plain unescaped dot
+      msg =
+        "has an unescaped '.' before '" + hostname +
+          "', so it might match more hosts than expected."
+  )
+}
+
+from
+  RegExpPatternSource re, RegExpTerm regexp, RegExpSequence hostSequence, string msg, string kind,
+  DataFlow::Node aux
+where
+  regexp = re.getRegExpTerm() and
+  isIncompleteHostNameRegExpPattern(regexp, hostSequence, msg) and
+  (
+    if re.getAParse() != re // some parse of `re` is a node other than `re` itself
+    then (
+      kind = "string, which is used as a regular expression $@," and // `$@` is filled in from `aux` and "here"
+      aux = re.getAParse()
+    ) else (
+      kind = "regular expression" and aux = re // no `$@` placeholder in this variant of the message
+    )
+  ) and
+  not CharacterEscapes::hasALikelyRegExpPatternMistake(re) // NOTE(review): presumably leaves such patterns to another query - confirm
+select hostSequence, "This " + kind + " " + msg, aux, "here"
diff --git a/ruby/ql/src/queries/security/cwe-020/examples/IncompleteHostnameRegExp.rb b/ruby/ql/src/queries/security/cwe-020/examples/IncompleteHostnameRegExp.rb
@@ -0,0 +1,9 @@
+app.get('/some/path', function(req, res) {
+    let url = req.param('url'),
+        host = urlLib.parse(url).host;
+    // BAD: the host of `url` may be controlled by an attacker, and the unescaped `.` matches any character
+    let regex = /^((www|beta).)?example.com/;
+    if (host.match(regex)) {
+        res.redirect(url);
+    }
+});
diff --git a/ruby/ql/test/query-tests/security/cwe-020/IncompleteHostnameRegExp/IncompleteHostnameRegExp.expected b/ruby/ql/test/query-tests/security/cwe-020/IncompleteHostnameRegExp/IncompleteHostnameRegExp.expected
@@ -0,0 +1,26 @@
+| tst-IncompleteHostnameRegExp.js:3:3:3:28 | ^http:\\/\\/test.example.com | This regular expression has an unescaped '.' before 'example.com', so it might match more hosts than expected. | tst-IncompleteHostnameRegExp.js:3:2:3:29 | /^http: ... le.com/ | here |
+| tst-IncompleteHostnameRegExp.js:5:3:5:28 | ^http:\\/\\/test.example.net | This regular expression has an unescaped '.' before 'example.net', so it might match more hosts than expected. | tst-IncompleteHostnameRegExp.js:5:2:5:29 | /^http: ... le.net/ | here |
+| tst-IncompleteHostnameRegExp.js:6:3:6:42 | ^http:\\/\\/test.(example-a\|example-b).com | This regular expression has an unescaped '.' before '(example-a\|example-b).com', so it might match more hosts than expected. | tst-IncompleteHostnameRegExp.js:6:2:6:43 | /^http: ... b).com/ | here |
+| tst-IncompleteHostnameRegExp.js:7:3:7:30 | ^http:\\/\\/(.+).example.com\\/ | This regular expression has an unescaped '.' before 'example.com', so it might match more hosts than expected. | tst-IncompleteHostnameRegExp.js:7:2:7:31 | /^http: ... .com\\// | here |
+| tst-IncompleteHostnameRegExp.js:7:3:7:30 | ^http:\\/\\/(.+).example.com\\/ | This regular expression has an unrestricted wildcard '.+' which may cause 'example.com' to be matched anywhere in the URL, outside the hostname. | tst-IncompleteHostnameRegExp.js:7:2:7:31 | /^http: ... .com\\// | here |
+| tst-IncompleteHostnameRegExp.js:10:3:10:36 | ^http:\\/\\/test.example.com\\/(?:.*) | This regular expression has an unescaped '.' before 'example.com', so it might match more hosts than expected. | tst-IncompleteHostnameRegExp.js:10:2:10:37 | /^http: ... (?:.*)/ | here |
+| tst-IncompleteHostnameRegExp.js:11:14:11:37 | ^http://test.example.com | This regular expression has an unescaped '.' before 'example.com', so it might match more hosts than expected. | tst-IncompleteHostnameRegExp.js:11:13:11:38 | "^http: ... le.com" | here |
+| tst-IncompleteHostnameRegExp.js:12:15:12:38 | ^http://test.example.com | This regular expression has an unescaped '.' before 'example.com', so it might match more hosts than expected. | tst-IncompleteHostnameRegExp.js:12:14:12:39 | "^http: ... le.com" | here |
+| tst-IncompleteHostnameRegExp.js:15:23:15:46 | ^http://test.example.com | This string, which is used as a regular expression $@, has an unescaped '.' before 'example.com', so it might match more hosts than expected. | tst-IncompleteHostnameRegExp.js:15:13:15:50 | id(id(i ... com"))) | here |
+| tst-IncompleteHostnameRegExp.js:19:18:19:34 | ^test.example.com | This string, which is used as a regular expression $@, has an unescaped '.' before 'example.com', so it might match more hosts than expected. | tst-IncompleteHostnameRegExp.js:20:13:20:26 | `${hostname}$` | here |
+| tst-IncompleteHostnameRegExp.js:22:28:22:44 | test.example.com$ | This string, which is used as a regular expression $@, has an unescaped '.' before 'example.com', so it might match more hosts than expected. | tst-IncompleteHostnameRegExp.js:23:13:23:27 | domain.hostname | here |
+| tst-IncompleteHostnameRegExp.js:28:24:28:40 | test.example.com$ | This string, which is used as a regular expression $@, has an unescaped '.' before 'example.com', so it might match more hosts than expected. | tst-IncompleteHostnameRegExp.js:26:21:26:35 | domain.hostname | here |
+| tst-IncompleteHostnameRegExp.js:30:31:30:47 | test.example.com$ | This string, which is used as a regular expression $@, has an unescaped '.' before 'example.com', so it might match more hosts than expected. | tst-IncompleteHostnameRegExp.js:32:21:32:35 | domain.hostname | here |
+| tst-IncompleteHostnameRegExp.js:37:3:37:53 | ^(https?:)?\\/\\/((service\|www).)?example.com(?=$\|\\/) | This regular expression has an unescaped '.' before 'example.com', so it might match more hosts than expected. | tst-IncompleteHostnameRegExp.js:37:2:37:54 | /^(http ... =$\|\\/)/ | here |
+| tst-IncompleteHostnameRegExp.js:38:3:38:43 | ^(http\|https):\\/\\/www.example.com\\/p\\/f\\/ | This regular expression has an unescaped '.' before 'example.com', so it might match more hosts than expected. | tst-IncompleteHostnameRegExp.js:38:2:38:44 | /^(http ... p\\/f\\// | here |
+| tst-IncompleteHostnameRegExp.js:39:5:39:30 | http:\\/\\/sub.example.com\\/ | This regular expression has an unescaped '.' before 'example.com', so it might match more hosts than expected. | tst-IncompleteHostnameRegExp.js:39:2:39:33 | /^(http ... om\\/)/g | here |
+| tst-IncompleteHostnameRegExp.js:40:3:40:29 | ^https?:\\/\\/api.example.com | This regular expression has an unescaped '.' before 'example.com', so it might match more hosts than expected. | tst-IncompleteHostnameRegExp.js:40:2:40:30 | /^https ... le.com/ | here |
+| tst-IncompleteHostnameRegExp.js:41:42:41:48 | ^https?://.+\\.example\\.com/ | This regular expression has an unrestricted wildcard '.+' which may cause 'example\\.com/' to be matched anywhere in the URL, outside the hostname. | tst-IncompleteHostnameRegExp.js:41:13:41:71 | '^http: ... \\.com/' | here |
+| tst-IncompleteHostnameRegExp.js:43:3:43:32 | ^https:\\/\\/[a-z]*.example.com$ | This regular expression has an unescaped '.' before 'example.com', so it might match more hosts than expected. | tst-IncompleteHostnameRegExp.js:43:2:43:33 | /^https ... e.com$/ | here |
+| tst-IncompleteHostnameRegExp.js:44:32:44:45 | .+.example.net | This regular expression has an unescaped '.' before 'example.net', so it might match more hosts than expected. | tst-IncompleteHostnameRegExp.js:44:9:44:101 | '^proto ... ernal)' | here |
+| tst-IncompleteHostnameRegExp.js:44:47:44:62 | .+.example-a.com | This regular expression has an unescaped '.' before 'example-a.com', so it might match more hosts than expected. | tst-IncompleteHostnameRegExp.js:44:9:44:101 | '^proto ... ernal)' | here |
+| tst-IncompleteHostnameRegExp.js:44:64:44:79 | .+.example-b.com | This regular expression has an unescaped '.' before 'example-b.com', so it might match more hosts than expected. | tst-IncompleteHostnameRegExp.js:44:9:44:101 | '^proto ... ernal)' | here |
+| tst-IncompleteHostnameRegExp.js:48:42:48:47 | ^https?://.+.example\\.com/ | This regular expression has an unescaped '.' before 'example\\.com/', so it might match more hosts than expected. | tst-IncompleteHostnameRegExp.js:48:13:48:69 | '^http: ... \\.com/' | here |
+| tst-IncompleteHostnameRegExp.js:48:42:48:47 | ^https?://.+.example\\.com/ | This regular expression has an unrestricted wildcard '.+' which may cause 'example\\.com/' to be matched anywhere in the URL, outside the hostname. | tst-IncompleteHostnameRegExp.js:48:13:48:69 | '^http: ... \\.com/' | here |
+| tst-IncompleteHostnameRegExp.js:53:14:53:35 | test.example.com$ | This regular expression has an unescaped '.' before 'example.com', so it might match more hosts than expected. | tst-IncompleteHostnameRegExp.js:53:13:53:36 | 'test.' ... e.com$' | here |
+| tst-IncompleteHostnameRegExp.js:59:5:59:20 | foo.example\\.com | This regular expression has an unescaped '.' before 'example\\.com', so it might match more hosts than expected. | tst-IncompleteHostnameRegExp.js:59:2:59:32 | /^(foo. ... ever)$/ | here |
diff --git a/ruby/ql/test/query-tests/security/cwe-020/IncompleteHostnameRegExp/IncompleteHostnameRegExp.qlref b/ruby/ql/test/query-tests/security/cwe-020/IncompleteHostnameRegExp/IncompleteHostnameRegExp.qlref
@@ -0,0 +1 @@
+queries/security/cwe-020/IncompleteHostnameRegExp.ql
diff --git a/ruby/ql/test/query-tests/security/cwe-020/IncompleteHostnameRegExp/hosttest.rb b/ruby/ql/test/query-tests/security/cwe-020/IncompleteHostnameRegExp/hosttest.rb
@@ -0,0 +1,23 @@
+from flask import Flask, request, redirect
+import re
+
+app = Flask(__name__)
+
+UNSAFE_REGEX = re.compile("(www|beta).example.com/")  # both '.' are unescaped, so each matches any character
+SAFE_REGEX = re.compile(r"(www|beta)\.example\.com/")  # both dots are escaped and match a literal '.' only
+
+@app.route('/some/path/bad')
+def unsafe(request):
+    target = request.args.get('target', '')
+    if UNSAFE_REGEX.match(target):  # BAD: checked against the pattern with unescaped dots
+        return redirect(target)
+
+@app.route('/some/path/good')
+def safe(request):
+    target = request.args.get('target', '')
+    if SAFE_REGEX.match(target):  # GOOD: checked against the pattern with escaped dots
+        return redirect(target)
+
+# False positive (FP) reported in https://github.com/github/codeql/issues/3712
+# This does not define a regex itself (but the escaped string could be used by other code to build one)
+escaped = re.escape("https://www.humblebundle.com/home/library")
diff --git a/ruby/ql/test/query-tests/security/cwe-020/IncompleteHostnameRegExp/tst-IncompleteHostnameRegExp.rb b/ruby/ql/test/query-tests/security/cwe-020/IncompleteHostnameRegExp/tst-IncompleteHostnameRegExp.rb

Original file line number	Diff line number	Diff line change
`@@ -0,0 +1 @@`
	`1`	`+Security/CWE-020/IncompleteHostnameRegExp.ql`