cklin
diff --git a/config/identical-files.json b/config/identical-files.json
Lines changed: 5 additions & 1 deletion
diff --git a/javascript/ql/lib/semmle/javascript/Regexp.qll b/javascript/ql/lib/semmle/javascript/Regexp.qll
Lines changed: 8 additions & 2 deletions
diff --git a/javascript/ql/src/Security/CWE-020/HostnameRegexpShared.qll b/javascript/ql/src/Security/CWE-020/HostnameRegexpShared.qll
Lines changed: 95 additions & 2 deletions
diff --git a/javascript/ql/src/Security/CWE-020/HostnameRegexpSpecific.qll b/javascript/ql/src/Security/CWE-020/HostnameRegexpSpecific.qll
Lines changed: 1 addition & 0 deletions
diff --git a/javascript/ql/src/Security/CWE-020/IncompleteHostnameRegExp.qhelp b/javascript/ql/src/Security/CWE-020/IncompleteHostnameRegExp.qhelp
Lines changed: 2 additions & 2 deletions
diff --git a/javascript/ql/src/Security/CWE-020/IncompleteHostnameRegExp.ql b/javascript/ql/src/Security/CWE-020/IncompleteHostnameRegExp.ql
Lines changed: 1 addition & 92 deletions
diff --git a/javascript/ql/src/Security/CWE-020/IncompleteUrlSubstringSanitization.ql b/javascript/ql/src/Security/CWE-020/IncompleteUrlSubstringSanitization.ql
Lines changed: 1 addition & 1 deletion
diff --git a/javascript/ql/test/query-tests/Security/CWE-020/IncompleteHostnameRegExp.expected b/javascript/ql/test/query-tests/Security/CWE-020/IncompleteHostnameRegExp.expected
Lines changed: 1 addition & 0 deletions
diff --git a/javascript/ql/test/query-tests/Security/CWE-020/tst-IncompleteHostnameRegExp.js b/javascript/ql/test/query-tests/Security/CWE-020/tst-IncompleteHostnameRegExp.js
Lines changed: 1 addition & 1 deletion
diff --git a/python/ql/src/Security/CWE-020/IncompleteHostnameRegExp.qhelp b/python/ql/src/Security/CWE-020/IncompleteHostnameRegExp.qhelp
Lines changed: 1 addition & 1 deletion
@@ -516,8 +516,12 @@
     "javascript/ql/lib/semmle/javascript/frameworks/data/internal/AccessPathSyntax.qll",
     "ruby/ql/lib/codeql/ruby/dataflow/internal/AccessPathSyntax.qll"
   ],
+  "Hostname Regexp queries": [
+    "javascript/ql/src/Security/CWE-020/HostnameRegexpShared.qll",
+    "ruby/ql/src/queries/security/cwe-020/HostnameRegexpShared.qll"
+  ],
   "ApiGraphModels": [
     "javascript/ql/lib/semmle/javascript/frameworks/data/internal/ApiGraphModels.qll",
     "ruby/ql/lib/codeql/ruby/frameworks/data/internal/ApiGraphModels.qll"
   ]
-}
+}
@@ -990,16 +990,22 @@ predicate isInterpretedAsRegExp(DataFlow::Node source) {
 }
 
 /**
- * Provides regular expression patterns.
+ * Provides utility predicates related to regular expressions.
  */
 module RegExpPatterns {
   /**
    * Gets a pattern that matches common top-level domain names in lower case.
    */
-  string commonTLD() {
+  string getACommonTld() {
     // according to ranking by http://google.com/search?q=site:.<<TLD>>
     result = "(?:com|org|edu|gov|uk|net|io)(?![a-z0-9])"
   }
+
+  /**
+   * Gets a pattern that matches common top-level domain names in lower case.
+   * DEPRECATED: use `getACommonTld` instead
+   */
+  deprecated predicate commonTLD = getACommonTld/0;
 }
 
 /**
 
@@ -3,7 +3,7 @@
  * that match URLs and hostname patterns.
  */
 
-import javascript
+private import HostnameRegexpSpecific
 
 /**
  * Holds if the given constant is unlikely to occur in the origin part of a URL.
@@ -62,7 +62,7 @@ predicate hasTopLevelDomainEnding(RegExpSequence seq, int i) {
   seq.getChild(i)
       .(RegExpConstant)
       .getValue()
-      .regexpMatch("(?i)" + RegExpPatterns::commonTLD() + "(:\\d+)?([/?#].*)?") and
+      .regexpMatch("(?i)" + RegExpPatterns::getACommonTld() + "(:\\d+)?([/?#].*)?") and
   isDotLike(seq.getChild(i - 1)) and
   not (i = 1 and matchesBeginningOfString(seq))
 }
@@ -107,3 +107,96 @@ predicate alwaysMatchesHostnameAlt(RegExpAlt alt, int i) {
   alwaysMatchesHostnameAlt(alt, i - 1) and
   alwaysMatchesHostname(alt.getChild(i))
 }
+
+/**
+ * Holds if `term` occurs inside a quantifier or alternative (and thus
+ * cannot be expected to correspond to a unique match), or as part of
+ * a lookaround assertion (which are rarely used for capture groups).
+ *
+ * The check is transitive: it holds if the parent of `term`, or any
+ * further ancestor of `term`, is a `RegExpAlt`, `RegExpQuantifier`
+ * or `RegExpSubPattern`.
+ */
+predicate isInsideChoiceOrSubPattern(RegExpTerm term) {
+  exists(RegExpParent parent | parent = term.getParent() |
+    parent instanceof RegExpAlt
+    or
+    parent instanceof RegExpQuantifier
+    or
+    parent instanceof RegExpSubPattern
+    or
+    // Recurse upwards: the parent itself may be nested inside such a construct.
+    isInsideChoiceOrSubPattern(parent)
+  )
+}
+
+/**
+ * Holds if `group` is likely to be used as a capture group.
+ *
+ * That is, `group` is a capturing group (`group.isCapture()`) that is not
+ * nested inside a quantifier, alternative or sub-pattern, as determined by
+ * `isInsideChoiceOrSubPattern`.
+ */
+predicate isLikelyCaptureGroup(RegExpGroup group) {
+  group.isCapture() and
+  not isInsideChoiceOrSubPattern(group)
+}
+
+/**
+ * Holds if `seq` contains two consecutive dots `..` or escaped dots.
+ *
+ * More precisely, holds if some child of `seq` and the child immediately
+ * following it both satisfy `isDotLike`.
+ *
+ * At least one of these dots is not intended to be a subdomain separator,
+ * so we avoid flagging the pattern in this case.
+ */
+predicate hasConsecutiveDots(RegExpSequence seq) {
+  exists(int i |
+    isDotLike(seq.getChild(i)) and
+    isDotLike(seq.getChild(i + 1))
+  )
+}
+
+/**
+ * Holds if `seq`, which is `regexp` itself or one of its transitive children,
+ * looks like a hostname pattern containing an unescaped `.` that may match
+ * more than intended, and `msg` describes the problem.
+ *
+ * The sequence must have a top-level-domain ending at some child index `i`
+ * (see `hasTopLevelDomainEnding`), and there must be a `RegExpDot` somewhere
+ * within the children before index `i` that is not the child at `i - 1`
+ * itself. Candidates are discarded if a term before the TLD is rejected by
+ * `isConstantInvalidInsideOrigin`, if a likely capture group occurs at or
+ * after index `i`, or if the parent of the unescaped dot has consecutive
+ * dot-like children.
+ *
+ * `msg` is the tail of an alert message: it either reports an unrestricted
+ * wildcard (when the dot is directly inside a quantifier) or a plain
+ * unescaped `.` before the hostname.
+ */
+predicate isIncompleteHostNameRegExpPattern(RegExpTerm regexp, RegExpSequence seq, string msg) {
+  seq = regexp.getAChild*() and
+  exists(RegExpDot unescapedDot, int i, string hostname |
+    hasTopLevelDomainEnding(seq, i) and
+    not isConstantInvalidInsideOrigin(seq.getChild([0 .. i - 1]).getAChild*()) and
+    not isLikelyCaptureGroup(seq.getChild([i .. seq.getNumChild() - 1]).getAChild*()) and
+    unescapedDot = seq.getChild([0 .. i - 1]).getAChild*() and
+    unescapedDot != seq.getChild(i - 1) and // Should not be the '.' immediately before the TLD
+    not hasConsecutiveDots(unescapedDot.getParent()) and
+    // The reported hostname is the raw text of the three children ending at the TLD.
+    hostname =
+      seq.getChild(i - 2).getRawValue() + seq.getChild(i - 1).getRawValue() +
+        seq.getChild(i).getRawValue()
+  |
+    if unescapedDot.getParent() instanceof RegExpQuantifier
+    then
+      // `.*\.example.com` can match `evil.com/?x=.example.com`
+      //
+      // This problem only occurs when the pattern is applied against a full URL, not just a hostname/origin.
+      // We therefore check if the pattern includes a suffix after the TLD, such as `.*\.example.com/`.
+      // Note that a post-anchored pattern (`.*\.example.com$`) will usually fail to match a full URL,
+      // and patterns with neither a suffix nor an anchor fall under the purview of MissingRegExpAnchor.
+      seq.getChild(0) instanceof RegExpCaret and
+      not seq.getAChild() instanceof RegExpDollar and
+      seq.getChild([i .. i + 1]).(RegExpConstant).getValue().regexpMatch(".*[/?#].*") and
+      msg =
+        "has an unrestricted wildcard '" + unescapedDot.getParent().(RegExpQuantifier).getRawValue()
+          + "' which may cause '" + hostname +
+          "' to be matched anywhere in the URL, outside the hostname."
+    else
+      msg =
+        "has an unescaped '.' before '" + hostname +
+          "', so it might match more hosts than expected."
+  )
+}
+
+/**
+ * Holds if `hostSequence` is an incomplete hostname pattern (see
+ * `isIncompleteHostNameRegExpPattern`) inside the regular expression term
+ * of some `RegExpPatternSource`, with the columns of the resulting alert
+ * given by `message`, `aux` and `label`.
+ *
+ * - `message` is `"This <kind> <msg>"`, where `<kind>` says whether the
+ *   source is a regular expression or a string used as one. In the latter
+ *   case the text contains a `$@` placeholder, presumably to be rendered as
+ *   a link to `aux` labelled with `label`.
+ * - `aux` is a result of `getAParse()` on the pattern source when such a
+ *   result differs from the source itself, and the source itself otherwise.
+ * - `label` is always `"here"`.
+ */
+predicate incompleteHostnameRegExp(
+  RegExpSequence hostSequence, string message, DataFlow::Node aux, string label
+) {
+  exists(RegExpPatternSource re, RegExpTerm regexp, string msg, string kind |
+    regexp = re.getRegExpTerm() and
+    isIncompleteHostNameRegExpPattern(regexp, hostSequence, msg) and
+    (
+      // Distinguish a pattern source that is parsed elsewhere from one that is its own parse.
+      if re.getAParse() != re
+      then (
+        kind = "string, which is used as a regular expression $@," and
+        aux = re.getAParse()
+      ) else (
+        kind = "regular expression" and aux = re
+      )
+    )
+  |
+    message = "This " + kind + " " + msg and label = "here"
+  )
+}
@@ -0,0 +1 @@
+import javascript
@@ -30,7 +30,7 @@
 		<p>
 
 			Escape all meta-characters appropriately when constructing
-			regular expressions for security checks, pay special attention to the
+			regular expressions for security checks, and pay special attention to the
 			<code>.</code> meta-character.
 
 		</p>
@@ -59,7 +59,7 @@
 		<p>
 
 			Address this vulnerability by escaping <code>.</code>
-			appropriately: <code>let regex = /((www|beta)\.)?example\.com/</code>.
+			appropriately: <code>let regex = /^((www|beta)\.)?example\.com/</code>.
 
 		</p>
 
 
@@ -11,97 +11,6 @@
  *       external/cwe/cwe-020
  */
 
-import javascript
-import semmle.javascript.CharacterEscapes
 import HostnameRegexpShared
 
-/**
- * Holds if `term` occurs inside a quantifier or alternative (and thus
- * can not be expected to correspond to a unique match), or as part of
- * a lookaround assertion (which are rarely used for capture groups).
- */
-predicate isInsideChoiceOrSubPattern(RegExpTerm term) {
-  exists(RegExpParent parent | parent = term.getParent() |
-    parent instanceof RegExpAlt
-    or
-    parent instanceof RegExpQuantifier
-    or
-    parent instanceof RegExpSubPattern
-    or
-    isInsideChoiceOrSubPattern(parent)
-  )
-}
-
-/**
- * Holds if `group` is likely to be used as a capture group.
- */
-predicate isLikelyCaptureGroup(RegExpGroup group) {
-  group.isCapture() and
-  not isInsideChoiceOrSubPattern(group)
-}
-
-/**
- * Holds if `seq` contains two consecutive dots `..` or escaped dots.
- *
- * At least one of these dots is not intended to be a subdomain separator,
- * so we avoid flagging the pattern in this case.
- */
-predicate hasConsecutiveDots(RegExpSequence seq) {
-  exists(int i |
-    isDotLike(seq.getChild(i)) and
-    isDotLike(seq.getChild(i + 1))
-  )
-}
-
-predicate isIncompleteHostNameRegExpPattern(RegExpTerm regexp, RegExpSequence seq, string msg) {
-  seq = regexp.getAChild*() and
-  exists(RegExpDot unescapedDot, int i, string hostname |
-    hasTopLevelDomainEnding(seq, i) and
-    not isConstantInvalidInsideOrigin(seq.getChild([0 .. i - 1]).getAChild*()) and
-    not isLikelyCaptureGroup(seq.getChild([i .. seq.getNumChild() - 1]).getAChild*()) and
-    unescapedDot = seq.getChild([0 .. i - 1]).getAChild*() and
-    unescapedDot != seq.getChild(i - 1) and // Should not be the '.' immediately before the TLD
-    not hasConsecutiveDots(unescapedDot.getParent()) and
-    hostname =
-      seq.getChild(i - 2).getRawValue() + seq.getChild(i - 1).getRawValue() +
-        seq.getChild(i).getRawValue()
-  |
-    if unescapedDot.getParent() instanceof RegExpQuantifier
-    then
-      // `.*\.example.com` can match `evil.com/?x=.example.com`
-      //
-      // This problem only occurs when the pattern is applied against a full URL, not just a hostname/origin.
-      // We therefore check if the pattern includes a suffix after the TLD, such as `.*\.example.com/`.
-      // Note that a post-anchored pattern (`.*\.example.com$`) will usually fail to match a full URL,
-      // and patterns with neither a suffix nor an anchor fall under the purview of MissingRegExpAnchor.
-      seq.getChild(0) instanceof RegExpCaret and
-      not seq.getAChild() instanceof RegExpDollar and
-      seq.getChild([i .. i + 1]).(RegExpConstant).getValue().regexpMatch(".*[/?#].*") and
-      msg =
-        "has an unrestricted wildcard '" + unescapedDot.getParent().(RegExpQuantifier).getRawValue()
-          + "' which may cause '" + hostname +
-          "' to be matched anywhere in the URL, outside the hostname."
-    else
-      msg =
-        "has an unescaped '.' before '" + hostname +
-          "', so it might match more hosts than expected."
-  )
-}
-
-from
-  RegExpPatternSource re, RegExpTerm regexp, RegExpSequence hostSequence, string msg, string kind,
-  DataFlow::Node aux
-where
-  regexp = re.getRegExpTerm() and
-  isIncompleteHostNameRegExpPattern(regexp, hostSequence, msg) and
-  (
-    if re.getAParse() != re
-    then (
-      kind = "string, which is used as a regular expression $@," and
-      aux = re.getAParse()
-    ) else (
-      kind = "regular expression" and aux = re
-    )
-  ) and
-  not CharacterEscapes::hasALikelyRegExpPatternMistake(re)
-select hostSequence, "This " + kind + " " + msg, aux, "here"
+// The results of this query are exactly the tuples of the shared `incompleteHostnameRegExp` predicate.
+query predicate problems = incompleteHostnameRegExp/4;
@@ -39,7 +39,7 @@ where
   (
     // target contains a domain on a common TLD, and perhaps some other URL components
     target
-        .regexpMatch("(?i)([a-z]*:?//)?\\.?([a-z0-9-]+\\.)+" + RegExpPatterns::commonTLD() +
+        .regexpMatch("(?i)([a-z]*:?//)?\\.?([a-z0-9-]+\\.)+" + RegExpPatterns::getACommonTld() +
             "(:[0-9]+)?/?")
     or
     // target is a HTTP URL to a domain on any TLD
 
@@ -23,4 +23,5 @@
 | tst-IncompleteHostnameRegExp.js:48:42:48:47 | ^https?://.+.example\\.com/ | This regular expression has an unescaped '.' before 'example\\.com/', so it might match more hosts than expected. | tst-IncompleteHostnameRegExp.js:48:13:48:69 | '^http: ... \\.com/' | here |
 | tst-IncompleteHostnameRegExp.js:48:42:48:47 | ^https?://.+.example\\.com/ | This regular expression has an unrestricted wildcard '.+' which may cause 'example\\.com/' to be matched anywhere in the URL, outside the hostname. | tst-IncompleteHostnameRegExp.js:48:13:48:69 | '^http: ... \\.com/' | here |
 | tst-IncompleteHostnameRegExp.js:53:14:53:35 | test.example.com$ | This regular expression has an unescaped '.' before 'example.com', so it might match more hosts than expected. | tst-IncompleteHostnameRegExp.js:53:13:53:36 | 'test.' ... e.com$' | here |
+| tst-IncompleteHostnameRegExp.js:55:14:55:38 | ^http://test.example.com | This regular expression has an unescaped '.' before 'example.com', so it might match more hosts than expected. | tst-IncompleteHostnameRegExp.js:55:13:55:39 | '^http: ... le.com' | here |
 | tst-IncompleteHostnameRegExp.js:59:5:59:20 | foo.example\\.com | This regular expression has an unescaped '.' before 'example\\.com', so it might match more hosts than expected. | tst-IncompleteHostnameRegExp.js:59:2:59:32 | /^(foo. ... ever)$/ | here |
@@ -52,7 +52,7 @@
 
 	new RegExp('test.' + 'example.com$'); // NOT OK
 
-	new RegExp('^http://test\.example.com'); // NOT OK, but flagged by js/useless-regexp-character-escape
+	new RegExp('^http://test\.example.com'); // NOT OK
 
 	/^http:\/\/(..|...)\.example\.com\/index\.html/; // OK, wildcards are intentional
 	/^http:\/\/.\.example\.com\/index\.html/; // OK, the wildcard is intentional
 
@@ -30,7 +30,7 @@
         <p>
 
             Escape all meta-characters appropriately when constructing
-            regular expressions for security checks, pay special attention to the
+            regular expressions for security checks, and pay special attention to the
             <code>.</code> meta-character.
 
         </p>
Original file line number	Diff line number	Diff line change
`@@ -39,7 +39,7 @@ where`
`39`	`39`	`(`
`40`	`40`	`// target contains a domain on a common TLD, and perhaps some other URL components`
`41`	`41`	`target`
`42`		`- .regexpMatch("(?i)([a-z]*:?//)?\\.?([a-z0-9-]+\\.)+" + RegExpPatterns::commonTLD() +`
	`42`	`+ .regexpMatch("(?i)([a-z]*:?//)?\\.?([a-z0-9-]+\\.)+" + RegExpPatterns::getACommonTld() +`
`43`	`43`	`"(:[0-9]+)?/?")`
`44`	`44`	`or`
`45`	`45`	`// target is a HTTP URL to a domain on any TLD`