Ruby: Add MissingRegExpAnchor query

hmac · hmac · commit e3c3c00c6844 · 2022-04-27T10:12:33.000+12:00
diff --git a/ruby/ql/src/queries/security/cwe-020/MissingRegExpAnchor.qhelp b/ruby/ql/src/queries/security/cwe-020/MissingRegExpAnchor.qhelp
@@ -0,0 +1,85 @@
+<!DOCTYPE qhelp PUBLIC
+"-//Semmle//qhelp//EN"
+"qhelp.dtd">
+<qhelp>
+
+	<overview>
+		<p>
+
+			Sanitizing untrusted input with regular expressions is a
+			common technique.  However, it is error-prone to match untrusted input
+			against regular expressions without anchors such as <code>^</code> or
+			<code>$</code>.  Malicious input can bypass such security checks by
+			embedding one of the allowed patterns in an unexpected location.
+
+		</p>
+
+		<p>
+
+			Even if the matching is not done in a security-critical
+			context, it may still cause undesirable behavior when the regular
+			expression accidentally matches.
+
+		</p>
+	</overview>
+
+	<recommendation>
+		<p>
+
+			Use anchors to ensure that regular expressions match at
+			the expected locations.
+
+		</p>
+	</recommendation>
+
+	<example>
+
+		<p>
+
+			The following example code checks that a URL redirection
+			will reach the <code>www.example.com</code> host, and not
+			some malicious site.
+
+		</p>
+
+		<sample src="examples/missing_regexp_anchor_bad.rb"/>
+
+		<p>
+
+			The check with the regular expression match is, however, easy to bypass, for example
+			by embedding <code>http://www.example.com/</code> in the query
+			string component: <code>http://evil-example.net/?x=http://www.example.com/</code>.
+
+			Address this shortcoming by anchoring the regular expression to the start of the string instead:
+
+		</p>
+
+		<sample src="examples/missing_regexp_anchor_good.rb"/>
+
+		<p>
+
+			A related mistake is to write a regular expression with
+			multiple alternatives, but to only include an anchor for one of the
+			alternatives. As an example, the regular expression
+			<code>/^www\.example\.com|beta\.example\.com/</code> will match the host
+			<code>evil.beta.example.com</code> because the regular expression is parsed
+			as <code>/(^www\.example\.com)|(beta\.example\.com)/</code>.
+
+			<!-- TODO: implement this part of the query -->
+
+		</p>
+
+		<p>
+
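+			In Ruby, the anchors <code>^</code> and <code>$</code> match the start and
+			end of a line, not the start and end of the whole string. A check that relies
+			on them can be bypassed by input that contains a newline character, because
+			the pattern only needs to match one of the lines. Use <code>\A</code> and
+			<code>\z</code> to match the start and end of the whole string instead.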
+
+		</p>
+
+	</example>
+
+	<references>
+		<li>OWASP: <a href="https://www.owasp.org/index.php/Server_Side_Request_Forgery">SSRF</a>.</li>
+		<li>OWASP: <a href="https://cheatsheetseries.owasp.org/cheatsheets/Unvalidated_Redirects_and_Forwards_Cheat_Sheet.html">Unvalidated Redirects and Forwards Cheat Sheet</a>.</li>
+	</references>
+</qhelp>
diff --git a/ruby/ql/src/queries/security/cwe-020/MissingRegExpAnchor.ql b/ruby/ql/src/queries/security/cwe-020/MissingRegExpAnchor.ql
@@ -0,0 +1,105 @@
+/**
+ * @name Missing regular expression anchor
+ * @description Regular expressions without anchors can be vulnerable to bypassing.
+ * @kind problem
+ * @problem.severity warning
+ * @security-severity 7.8
+ * @precision medium
+ * @id rb/regex/missing-regexp-anchor
+ * @tags correctness
+ *       security
+ *       external/cwe/cwe-020
+ */
+
+import HostnameRegexpShared
+import codeql.ruby.DataFlow
+import codeql.ruby.security.performance.RegExpTreeView
+
+/**
+ * Holds if `term` is in a final position of its regular expression, that is,
+ * no other term will match anything after it.
+ */
+predicate isFinalRegExpTerm(RegExpTerm term) {
+  // Base case: the root term.
+  term.isRootTerm()
+  or
+  exists(RegExpTerm parent | isFinalRegExpTerm(parent) |
+    // In a final sequence, only the last child is final.
+    term = parent.(RegExpSequence).getLastChild()
+    or
+    // For any other final parent, except a quantifier, every child is final.
+    not parent instanceof RegExpSequence and
+    not parent instanceof RegExpQuantifier and
+    term = parent.getAChild()
+  )
+}
+
+/**
+ * Holds if `src` contains a hostname pattern that uses the `^/$` line anchors
+ * rather than `\A/\z` which match the start/end of the whole string.
+ * `msg` is bound to the alert message describing the problem.
+ */
+predicate isLineAnchoredHostnameRegExp(RegExpPatternSource src, string msg) {
+  // Avoid double reporting. The message argument must be a wildcard here:
+  // `isSemiAnchoredHostnameRegExp` binds a different message than this
+  // predicate, so passing `msg` would make the negation hold for every `src`.
+  not isSemiAnchoredHostnameRegExp(src, _) and
+  exists(RegExpTerm term, RegExpSequence tld | term = src.getRegExpTerm() |
+    not isConstantInvalidInsideOrigin(term.getAChild*()) and
+    tld = term.getAChild*() and
+    // The TLD is deliberately not required to be a final term: a trailing `$`
+    // is itself a term after the TLD, so such a requirement would make the
+    // `$` case below unreachable.
+    hasTopLevelDomainEnding(tld, _) and
+    (
+      // Compare the anchor text so that only the line anchors are flagged.
+      tld.getChild(0).(RegExpCaret).getChar() = "^" or
+      tld.getLastChild().(RegExpDollar).getChar() = "$"
+    ) and
+    msg =
+      "This hostname pattern uses anchors such as '^' and '$', which match the start and end of a line, not the whole string. Use '\\A' and '\\z' instead."
+  )
+}
+
+/**
+ * Holds if `src` contains a hostname pattern that starts with an anchor but
+ * matches nothing after its top-level domain, that is, it has no closing
+ * `\z` or `/`. `msg` is bound to the alert message.
+ */
+predicate isSemiAnchoredHostnameRegExp(RegExpPatternSource src, string msg) {
+  // NOTE: a `not hasMisleadingAnchorPrecedence(src, _)` guard against double
+  // reporting is currently disabled here.
+  msg =
+    "This hostname pattern may match any domain name, as it is missing a '\\z' or '/' at the end." and
+  exists(RegExpTerm root, RegExpSequence hostSeq, int tldIndex |
+    root = src.getRegExpTerm() and
+    hostSeq = root.getAChild*()
+  |
+    hostSeq.getChild(0) instanceof RegExpCaret and
+    hasTopLevelDomainEnding(hostSeq, tldIndex) and
+    isFinalRegExpTerm(hostSeq.getChild(tldIndex)) and // nothing is matched after the TLD
+    not isConstantInvalidInsideOrigin(root.getAChild*())
+  )
+}
+
+/**
+ * Holds if `src` is a pattern for a URL that contains no anchors at all,
+ * indicating a mistake explained by `msg`.
+ */
+predicate isUnanchoredHostnameRegExp(RegExpPatternSource src, string msg) {
+  exists(RegExpTerm root, RegExpSequence hostSeq |
+    root = src.getRegExpTerm() and
+    hostSeq = root.getAChild*()
+  |
+    alwaysMatchesHostname(root) and
+    hasTopLevelDomainEnding(hostSeq) and
+    not root.getAChild*() instanceof RegExpAnchor and
+    not isConstantInvalidInsideOrigin(root.getAChild*())
+  ) and
+  // Skip patterns passed as the first argument of `sub`/`gsub` (and their
+  // `!` variants), either directly or via local flow.
+  not exists(DataFlow::CallNode call, DataFlow::Node regexArg |
+    call.getMethodName() = ["sub", "sub!", "gsub", "gsub!"] and
+    regexArg = call.getArgument(0)
+  |
+    src.getAParse() = regexArg
+    or
+    src.getAParse().(DataFlow::LocalSourceNode).flowsTo(regexArg)
+  ) and
+  msg =
+    "When this is used as a regular expression on a URL, it may match anywhere, and arbitrary hosts may come before or after it."
+}
+
+// Report every pattern flagged by one of the three hostname checks.
+from DataFlow::Node node, string message
+where
+  isLineAnchoredHostnameRegExp(node, message)
+  or
+  isSemiAnchoredHostnameRegExp(node, message)
+  or
+  isUnanchoredHostnameRegExp(node, message)
+select node, message
diff --git a/ruby/ql/src/queries/security/cwe-020/examples/missing_regexp_anchor_bad.rb b/ruby/ql/src/queries/security/cwe-020/examples/missing_regexp_anchor_bad.rb
@@ -0,0 +1,8 @@
+class UsersController < ActionController::Base
+    def index
+        # BAD: the host of `params[:url]` may be controlled by an attacker
+        if params[:url].match? /https?:\/\/www\.example\.com\//
+            redirect_to params[:url]
+        end
+    end
+end
diff --git a/ruby/ql/src/queries/security/cwe-020/examples/missing_regexp_anchor_good.rb b/ruby/ql/src/queries/security/cwe-020/examples/missing_regexp_anchor_good.rb
@@ -0,0 +1,8 @@
+class UsersController < ActionController::Base
+    def index
+        # GOOD: the host of `params[:url]` can not be controlled by an attacker
+        if params[:url].match? /\Ahttps?:\/\/www\.example\.com\//
+            redirect_to params[:url]
+        end
+    end
+end
diff --git a/ruby/ql/test/query-tests/security/cwe-020/MissingRegExpAnchor/MissingRegExpAnchor.expected b/ruby/ql/test/query-tests/security/cwe-020/MissingRegExpAnchor/MissingRegExpAnchor.expected
@@ -0,0 +1,4 @@
+| missing_regexp_anchor.rb:1:1:1:17 | /www.example.com/ | When this is used as a regular expression on a URL, it may match anywhere, and arbitrary hosts may come before or after it. |
+| missing_regexp_anchor.rb:7:1:7:21 | /https?:\\/\\/good.com/ | When this is used as a regular expression on a URL, it may match anywhere, and arbitrary hosts may come before or after it. |
+| missing_regexp_anchor.rb:8:1:8:22 | /^https?:\\/\\/good.com/ | This hostname pattern may match any domain name, as it is missing a '\\z' or '/' at the end. |
+| missing_regexp_anchor.rb:8:1:8:22 | /^https?:\\/\\/good.com/ | This hostname pattern uses anchors such as '^' and '$', which match the start and end of a line, not the whole string. Use '\\A' and '\\z' instead. |
diff --git a/ruby/ql/test/query-tests/security/cwe-020/MissingRegExpAnchor/MissingRegExpAnchor.qlref b/ruby/ql/test/query-tests/security/cwe-020/MissingRegExpAnchor/MissingRegExpAnchor.qlref
@@ -0,0 +1 @@
+queries/security/cwe-020/MissingRegExpAnchor.ql
diff --git a/ruby/ql/test/query-tests/security/cwe-020/MissingRegExpAnchor/missing_regexp_anchor.rb b/ruby/ql/test/query-tests/security/cwe-020/MissingRegExpAnchor/missing_regexp_anchor.rb
@@ -0,0 +1,18 @@
+/www.example.com/ # BAD
+/^www.example.com$/ # BAD: uses end-of-line anchors rather than end-of-string anchors
+/\Awww.example.com\z/ # GOOD
+
+/foo.bar/ # GOOD
+
+/https?:\/\/good.com/ # BAD
+/^https?:\/\/good.com/ # BAD: missing end-of-string anchor
+/(^https?:\/\/good1.com)|(^https?:\/\/good2.com)/ # BAD: missing end-of-string anchor
+
+/bar/ # GOOD
+
+foo.gsub(/www.example.com/, "bar") # GOOD
+foo.sub(/www.example.com/, "bar") # GOOD
+foo.gsub!(/www.example.com/, "bar") # GOOD
+foo.sub!(/www.example.com/, "bar") # GOOD
+
+

Original file line number	Diff line number	Diff line change
`@@ -0,0 +1 @@`
	`1`	`+queries/security/cwe-020/MissingRegExpAnchor.ql`