Ruby: Add hasMisleadingAnchorPrecedence to MissingRegExpAnchor

hmac · hmac · commit 2feb4a48beb8 · 2022-04-27T10:12:33.000+12:00
diff --git a/ruby/ql/src/queries/security/cwe-020/MissingRegExpAnchor.qhelp b/ruby/ql/src/queries/security/cwe-020/MissingRegExpAnchor.qhelp
@@ -65,8 +65,6 @@
 			<code>evil.beta.example.com</code> because the regular expression is parsed
 			as <code>/(^www\.example\.com)|(beta\.example\.com)/</code>
 
-			TODO: implement this part of the query
-
 		</p>
 
 		<p>
diff --git a/ruby/ql/src/queries/security/cwe-020/MissingRegExpAnchor.ql b/ruby/ql/src/queries/security/cwe-020/MissingRegExpAnchor.ql
@@ -34,12 +34,158 @@ predicate isFinalRegExpTerm(RegExpTerm term) {
   )
 }
 
+/** Holds if `term` is one of the transitive left children of a regexp. */
+predicate isLeftArmTerm(RegExpTerm term) {
+  term.isRootTerm()
+  or
+  exists(RegExpTerm parent |
+    term = parent.getChild(0) and
+    isLeftArmTerm(parent)
+  )
+}
+
+/** Holds if `term` is one of the transitive right children of a regexp. */
+predicate isRightArmTerm(RegExpTerm term) {
+  term.isRootTerm()
+  or
+  exists(RegExpTerm parent |
+    term = parent.getLastChild() and
+    isRightArmTerm(parent)
+  )
+}
+
+/**
+ * Holds if `term` is an anchor that is not the first or last node
+ * in its tree.
+ */
+predicate isInteriorAnchor(RegExpAnchor term) {
+  not isLeftArmTerm(term) and
+  not isRightArmTerm(term)
+}
+
+/**
+ * Holds if `term` contains an anchor that is not the first or last node
+ * in its tree, such as `(foo|bar$|baz)`.
+ */
+predicate containsInteriorAnchor(RegExpTerm term) { isInteriorAnchor(term.getAChild*()) }
+
+/**
+ * Holds if `term` starts with a word boundary or lookbehind assertion,
+ * indicating that it's not intended to be anchored on that side.
+ */
+predicate containsLeadingPseudoAnchor(RegExpSequence term) {
+  exists(RegExpTerm child | child = term.getChild(0) |
+    child instanceof RegExpWordBoundary or
+    child instanceof RegExpNonWordBoundary or
+    child instanceof RegExpLookbehind
+  )
+}
+
+/**
+ * Holds if `term` ends with a word boundary or lookahead assertion,
+ * indicating that it's not intended to be anchored on that side.
+ */
+predicate containsTrailingPseudoAnchor(RegExpSequence term) {
+  exists(RegExpTerm child | child = term.getLastChild() |
+    child instanceof RegExpWordBoundary or
+    child instanceof RegExpNonWordBoundary or
+    child instanceof RegExpLookahead
+  )
+}
+
+/**
+ * Holds if `term` is an empty sequence, usually arising from
+ * literals with a trailing alternative such as `foo|`.
+ */
+predicate isEmpty(RegExpSequence term) { term.getNumChild() = 0 }
+
+/**
+ * Holds if `term` contains a letter constant.
+ *
+ * We use this as a heuristic to filter out uninteresting results.
+ */
+predicate containsLetters(RegExpTerm term) {
+  term.getAChild*().(RegExpConstant).getValue().regexpMatch(".*[a-zA-Z].*")
+}
+
+/**
+ * Holds if `alt` has an explicitly anchored group, such as `^(foo|bar)|baz`
+ * and doesn't have any unnecessary groups, such as in `^(foo)|(bar)`.
+ */
+predicate hasExplicitAnchorPrecedence(RegExpAlt alt) {
+  isAnchoredGroup(alt.getAChild()) and
+  not alt.getAChild() instanceof RegExpGroup
+}
+
+/**
+ * Holds if `term` consists only of an anchor and a parenthesized term,
+ * such as the left side of `^(foo|bar)|baz`.
+ *
+ * The precedence of the anchor is likely to be intentional in this case,
+ * as the group wouldn't be needed otherwise.
+ */
+predicate isAnchoredGroup(RegExpSequence term) {
+  term.getNumChild() = 2 and
+  term.getAChild() instanceof RegExpAnchor and
+  term.getAChild() instanceof RegExpGroup
+}
+
+/**
+ * Holds if `src` is a pattern for a collection of alternatives where
+ * only the first or last alternative is anchored, indicating a
+ * precedence mistake explained by `msg`.
+ *
+ * The canonical example of such a mistake is: `^a|b|c`, which is
+ * parsed as `(^a)|(b)|(c)`.
+ */
+predicate hasMisleadingAnchorPrecedence(RegExpPatternSource src, string msg) {
+  exists(RegExpAlt root, RegExpSequence anchoredTerm, string direction |
+    root = src.getRegExpTerm() and
+    not containsInteriorAnchor(root) and
+    not isEmpty(root.getAChild()) and
+    not hasExplicitAnchorPrecedence(root) and
+    containsLetters(anchoredTerm) and
+    (
+      anchoredTerm = root.getChild(0) and
+      anchoredTerm.getChild(0) instanceof RegExpCaret and
+      not containsLeadingPseudoAnchor(root.getChild([1 .. root.getNumChild() - 1])) and
+      containsLetters(root.getChild([1 .. root.getNumChild() - 1])) and
+      direction = "beginning"
+      or
+      anchoredTerm = root.getLastChild() and
+      anchoredTerm.getLastChild() instanceof RegExpDollar and
+      not containsTrailingPseudoAnchor(root.getChild([0 .. root.getNumChild() - 2])) and
+      containsLetters(root.getChild([0 .. root.getNumChild() - 2])) and
+      direction = "end"
+    ) and
+    // that is not used for string replacement
+    not exists(DataFlow::CallNode mcn, DataFlow::Node arg, string name |
+      name = mcn.getMethodName() and
+      arg = mcn.getArgument(0)
+    |
+      (
+        src.getAParse().(DataFlow::LocalSourceNode).flowsTo(arg) or
+        src.getAParse() = arg
+      ) and
+      name = ["sub", "sub!", "gsub", "gsub!"]
+    ) and
+    msg =
+      "Misleading operator precedence. The subexpression '" + anchoredTerm.getRawValue() +
+        "' is anchored at the " + direction +
+        ", but the other parts of this regular expression are not"
+  )
+}
+
 /**
  * Holds if `src` contains a hostname pattern that uses the `^/$` line anchors
  * rather than `\A/\z` which match the start/end of the whole string.
  */
 predicate isLineAnchoredHostnameRegExp(RegExpPatternSource src, string msg) {
-  not isSemiAnchoredHostnameRegExp(src, msg) and // avoid double reporting
+  // avoid double reporting
+  not (
+    isSemiAnchoredHostnameRegExp(src, _) or
+    hasMisleadingAnchorPrecedence(src, _)
+  ) and
   exists(RegExpTerm term, RegExpSequence tld, int i | term = src.getRegExpTerm() |
     not isConstantInvalidInsideOrigin(term.getAChild*()) and
     tld = term.getAChild*() and
@@ -58,7 +204,7 @@ predicate isLineAnchoredHostnameRegExp(RegExpPatternSource src, string msg) {
  * Holds if `src` contains a hostname pattern that is missing a `$` anchor.
  */
 predicate isSemiAnchoredHostnameRegExp(RegExpPatternSource src, string msg) {
-  // not hasMisleadingAnchorPrecedence(src, _) and // avoid double reporting
+  not hasMisleadingAnchorPrecedence(src, _) and // avoid double reporting
   exists(RegExpTerm term, RegExpSequence tld, int i | term = src.getRegExpTerm() |
     not isConstantInvalidInsideOrigin(term.getAChild*()) and
     tld = term.getAChild*() and
@@ -81,7 +227,7 @@ predicate isUnanchoredHostnameRegExp(RegExpPatternSource src, string msg) {
     hasTopLevelDomainEnding(tld) and
     not isConstantInvalidInsideOrigin(term.getAChild*()) and
     not term.getAChild*() instanceof RegExpAnchor and
-    // that is not used for capture or replace
+    // that is not used for string replacement
     not exists(DataFlow::CallNode mcn, DataFlow::Node arg, string name |
       name = mcn.getMethodName() and
       arg = mcn.getArgument(0)
@@ -101,5 +247,6 @@ from DataFlow::Node nd, string msg
 where
   isUnanchoredHostnameRegExp(nd, msg) or
   isSemiAnchoredHostnameRegExp(nd, msg) or
-  isLineAnchoredHostnameRegExp(nd, msg)
+  isLineAnchoredHostnameRegExp(nd, msg) or
+  hasMisleadingAnchorPrecedence(nd, msg)
 select nd, msg
diff --git a/ruby/ql/test/query-tests/security/cwe-020/MissingRegExpAnchor/MissingRegExpAnchor.expected b/ruby/ql/test/query-tests/security/cwe-020/MissingRegExpAnchor/MissingRegExpAnchor.expected
@@ -1,4 +1,18 @@
-| missing_regexp_anchor.rb:1:1:1:17 | /www.example.com/ | When this is used as a regular expression on a URL, it may match anywhere, and arbitrary hosts may come before or after it. |
-| missing_regexp_anchor.rb:7:1:7:21 | /https?:\\/\\/good.com/ | When this is used as a regular expression on a URL, it may match anywhere, and arbitrary hosts may come before or after it. |
-| missing_regexp_anchor.rb:8:1:8:22 | /^https?:\\/\\/good.com/ | This hostname pattern may match any domain name, as it is missing a '\\z' or '/' at the end. |
-| missing_regexp_anchor.rb:8:1:8:22 | /^https?:\\/\\/good.com/ | This hostname pattern uses anchors such as '^' and '$', which match the start and end of a line, not the whole string. Use '\\A' and '\\z' instead. |
+| missing_regexp_anchor.rb:1:1:1:19 | /www\\.example\\.com/ | When this is used as a regular expression on a URL, it may match anywhere, and arbitrary hosts may come before or after it. |
+| missing_regexp_anchor.rb:7:1:7:22 | /https?:\\/\\/good\\.com/ | When this is used as a regular expression on a URL, it may match anywhere, and arbitrary hosts may come before or after it. |
+| missing_regexp_anchor.rb:8:1:8:23 | /^https?:\\/\\/good\\.com/ | This hostname pattern may match any domain name, as it is missing a '\\z' or '/' at the end. |
+| missing_regexp_anchor.rb:19:1:19:6 | /^a\|b/ | Misleading operator precedence. The subexpression '^a' is anchored at the beginning, but the other parts of this regular expression are not |
+| missing_regexp_anchor.rb:22:1:22:8 | /^a\|b\|c/ | Misleading operator precedence. The subexpression '^a' is anchored at the beginning, but the other parts of this regular expression are not |
+| missing_regexp_anchor.rb:28:1:28:8 | /^a\|(b)/ | Misleading operator precedence. The subexpression '^a' is anchored at the beginning, but the other parts of this regular expression are not |
+| missing_regexp_anchor.rb:30:1:30:10 | /^(a)\|(b)/ | Misleading operator precedence. The subexpression '^(a)' is anchored at the beginning, but the other parts of this regular expression are not |
+| missing_regexp_anchor.rb:33:1:33:6 | /a\|b$/ | Misleading operator precedence. The subexpression 'b$' is anchored at the end, but the other parts of this regular expression are not |
+| missing_regexp_anchor.rb:36:1:36:8 | /a\|b\|c$/ | Misleading operator precedence. The subexpression 'c$' is anchored at the end, but the other parts of this regular expression are not |
+| missing_regexp_anchor.rb:42:1:42:8 | /(a)\|b$/ | Misleading operator precedence. The subexpression 'b$' is anchored at the end, but the other parts of this regular expression are not |
+| missing_regexp_anchor.rb:44:1:44:10 | /(a)\|(b)$/ | Misleading operator precedence. The subexpression '(b)$' is anchored at the end, but the other parts of this regular expression are not |
+| missing_regexp_anchor.rb:46:1:46:22 | /^good.com\|better.com/ | Misleading operator precedence. The subexpression '^good.com' is anchored at the beginning, but the other parts of this regular expression are not |
+| missing_regexp_anchor.rb:47:1:47:24 | /^good\\.com\|better\\.com/ | Misleading operator precedence. The subexpression '^good\\.com' is anchored at the beginning, but the other parts of this regular expression are not |
+| missing_regexp_anchor.rb:48:1:48:26 | /^good\\\\.com\|better\\\\.com/ | Misleading operator precedence. The subexpression '^good\\\\.com' is anchored at the beginning, but the other parts of this regular expression are not |
+| missing_regexp_anchor.rb:49:1:49:28 | /^good\\\\\\.com\|better\\\\\\.com/ | Misleading operator precedence. The subexpression '^good\\\\\\.com' is anchored at the beginning, but the other parts of this regular expression are not |
+| missing_regexp_anchor.rb:50:1:50:30 | /^good\\\\\\\\.com\|better\\\\\\\\.com/ | Misleading operator precedence. The subexpression '^good\\\\\\\\.com' is anchored at the beginning, but the other parts of this regular expression are not |
+| missing_regexp_anchor.rb:52:1:52:15 | /^foo\|bar\|baz$/ | Misleading operator precedence. The subexpression '^foo' is anchored at the beginning, but the other parts of this regular expression are not |
+| missing_regexp_anchor.rb:52:1:52:15 | /^foo\|bar\|baz$/ | Misleading operator precedence. The subexpression 'baz$' is anchored at the end, but the other parts of this regular expression are not |
diff --git a/ruby/ql/test/query-tests/security/cwe-020/MissingRegExpAnchor/missing_regexp_anchor.rb b/ruby/ql/test/query-tests/security/cwe-020/MissingRegExpAnchor/missing_regexp_anchor.rb
@@ -1,18 +1,53 @@
-/www.example.com/ # BAD
-/^www.example.com$/ # BAD: uses end-of-line anchors rather than end-of-string anchors
-/\Awww.example.com\z/ # GOOD
+/www\.example\.com/ # BAD
+/^www\.example\.com$/ # BAD: uses end-of-line anchors rather than end-of-string anchors
+/\Awww\.example\.com\z/ # GOOD
 
-/foo.bar/ # GOOD
+/foo\.bar/ # GOOD
 
-/https?:\/\/good.com/ # BAD
-/^https?:\/\/good.com/ # BAD: missing end-of-string anchor
-/(^https?:\/\/good1.com)|(^https?://good2.com)/ # BAD: missing end-of-string anchor
+/https?:\/\/good\.com/ # BAD
+/^https?:\/\/good\.com/ # BAD: missing end-of-string anchor
+/(^https?:\/\/good1\.com)|(^https?:#good2\.com)/ # BAD: missing end-of-string anchor
 
 /bar/ # GOOD
 
-foo.gsub(/www.example.com/, "bar") # GOOD
-foo.sub(/www.example.com/, "bar") # GOOD
-foo.gsub!(/www.example.com/, "bar") # GOOD
-foo.sub!(/www.example.com/, "bar") # GOOD
+foo.gsub(/www\.example\.com/, "bar") # GOOD
+foo.sub(/www\.example.com/, "bar") # GOOD
+foo.gsub!(/www\.example\.com/, "bar") # GOOD
+foo.sub!(/www\.example\.com/, "bar") # GOOD
 
+/^a|/
+/^a|b/ # BAD
+/a|^b/
+/^a|^b/
+/^a|b|c/ # BAD
+/a|^b|c/
+/a|b|^c/
+/^a|^b|c/
 
+/(^a)|b/
+/^a|(b)/ # BAD
+/^a|(^b)/
+/^(a)|(b)/ # BAD
+
+
+/a|b$/ # BAD
+/a$|b/
+/a$|b$/
+/a|b|c$/ # BAD
+/a|b$|c/
+/a$|b|c/
+/a|b$|c$/
+
+/a|(b$)/
+/(a)|b$/ # BAD
+/(a$)|b$/
+/(a)|(b)$/ # BAD
+
+/^good.com|better.com/ # BAD
+/^good\.com|better\.com/ # BAD
+/^good\\.com|better\\.com/ # BAD
+/^good\\\.com|better\\\.com/ # BAD
+/^good\\\\.com|better\\\\.com/ # BAD
+
+/^foo|bar|baz$/ # BAD
+/^foo|%/ # OK