Merge pull request #350 from smowton/smowton/feature/bad-regex-escape-query

max-schaefer · web-flow · commit 907ae20a1645 · 2020-09-24T09:49:16.000+01:00
Add query spotting probably-bad escapes in regular expressions.
diff --git a/change-notes/2020-09-22-suspicious-character-in-regexp.md b/change-notes/2020-09-22-suspicious-character-in-regexp.md
@@ -0,0 +1,2 @@
+lgtm,codescanning
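+* A new query `go/suspicious-character-in-regex` has been added. The query flags regular expressions built from string literals containing the Go escape sequences `\b` (backspace) and `\a` (bell), where the regular-expression escape `\\b` (word boundary) or `\\A` (start of text) was likely intended.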
diff --git a/ql/src/Security/CWE-020/SuspiciousCharacterInRegexp.go b/ql/src/Security/CWE-020/SuspiciousCharacterInRegexp.go
@@ -0,0 +1,13 @@
+package main
+
+import "regexp"
+
+// broken is meant to reject input that mentions forbidden.host.org, but the
+// check never fires: inside an interpreted string literal "\b" is the Go
+// escape for a backspace character, not the regexp word-boundary escape.
+func broken(hostNames []byte) string {
+	var hostRe = regexp.MustCompile("\bforbidden.host.org")
+	if hostRe.Match(hostNames) {
+		return "Must not target forbidden.host.org"
+	}
+	// This will be reached even if hostNames is exactly "forbidden.host.org",
+	// because the literal backspace is not matched
+	return ""
+}
diff --git a/ql/src/Security/CWE-020/SuspiciousCharacterInRegexp.qhelp b/ql/src/Security/CWE-020/SuspiciousCharacterInRegexp.qhelp
@@ -0,0 +1,66 @@
+<!DOCTYPE qhelp PUBLIC
+"-//Semmle//qhelp//EN"
+"qhelp.dtd">
+<qhelp>
+
+  <overview>
+    <p>
+
+      When a character in a string literal or regular expression
+      literal is preceded by a backslash, it is interpreted as part of an
+      escape sequence. For example, the escape sequence <code>\n</code> in a
+      string literal corresponds to a single <code>newline</code> character,
+      and not the <code>\</code> and <code>n</code> characters.
+
+      There are two Go escape sequences that could produce surprising results.
+      First, <code>regexp.Compile("\a")</code> matches the bell character, whereas
+      <code>regexp.Compile("\\A")</code> matches the start of text and
+      <code>regexp.Compile("\\a")</code> is a Vim (but not Go) regular expression
+      matching any alphabetic character. Second, <code>regexp.Compile("\b")</code>
+      matches a backspace, whereas <code>regexp.Compile("\\b")</code> matches a
+      word boundary. Confusing one for the other could lead to a regular expression
+      passing or failing much more often than expected, with potential security
+      consequences.
+
+      Note this is less of a problem than in some other languages because in Go,
+      only valid escape sequences are accepted, both in an ordinary string
+      (for example, <code>s := "\k"</code> will not compile as there is no such
+      escape sequence) and in regular expressions (for example,
+      <code>regexp.MustCompile("\\k")</code> will panic as <code>\k</code> does not
+      refer to a character class or other special token according to Go's regular
+      expression grammar).
+
+    </p>
+
+  </overview>
+
+  <recommendation>
+    <p>
+
+      Ensure that the right number of backslashes is used when
+      escaping characters in strings and regular
+      expressions.
+
+    </p>
+  </recommendation>
+
+  <example>
+
+    <p>The following example code fails to check for a forbidden word in an input string:</p>
+    <sample src="SuspiciousCharacterInRegexp.go"/>
+    <p>The check does not work, but can be fixed by escaping the backslash:</p>
+    <sample src="SuspiciousCharacterInRegexpGood.go"/>
+    <p>
+    Alternatively, you can use backtick-delimited raw string literals. 
+    For example, the <code>\b</code> in <code>regexp.Compile(`hello\bworld`)</code>
+    matches a word boundary, not a backspace character, as within backticks <code>\b</code> is not an
+    escape sequence.
+    </p>
+
+  </example>
+
+  <references>
+    <li>golang.org: <a href="https://golang.org/pkg/regexp/">Overview of the Regexp package</a>.</li>
+    <li>Google: <a href="https://github.com/google/re2/wiki/Syntax">Syntax of regular expressions accepted by RE2</a>.</li>
+  </references>
+</qhelp>
diff --git a/ql/src/Security/CWE-020/SuspiciousCharacterInRegexp.ql b/ql/src/Security/CWE-020/SuspiciousCharacterInRegexp.ql
@@ -0,0 +1,54 @@
+/**
+ * @name Suspicious characters in a regular expression
+ * @description If a literal bell character or backspace appears in a regular expression, the start of text or word boundary may have been intended.
+ * @kind path-problem
+ * @problem.severity warning
+ * @precision high
+ * @id go/suspicious-character-in-regex
+ * @tags correctness
+ *       security
+ *       external/cwe/cwe-20
+ */
+
+import go
+import DataFlow::PathGraph
+
+/**
+ * Holds if `source` corresponds to a literal that contains an escaped `character`.
+ *
+ * `character` must be `"a"` or `"b"`, the only interesting escapes for this query.
+ *
+ * Raw (backtick-delimited) string literals are excluded: Go performs no escape
+ * processing inside them, so a `\b` written there reaches the regular expression
+ * engine as a backslash followed by `b`, which is a word boundary.
+ */
+predicate containsEscapedCharacter(DataFlow::Node source, string character) {
+  character in ["a", "b"] and
+  exists(string text | text = source.asExpr().(BasicLit).getText() |
+    // The literal's source text starts with its delimiter; skip raw string literals.
+    not text.matches("`%") and
+    // Search for `character` preceded by an odd number of backslashes:
+    exists(text.regexpFind("(?<=(^|[^\\\\])\\\\(\\\\{2}){0,10})" + character, _, _))
+  )
+}
+
+/**
+ * A dataflow configuration that follows literals containing a suspicious escape
+ * sequence to the places where they are used as a regular expression.
+ */
+class Config extends DataFlow::Configuration {
+  Config() { this = "SuspiciousRegexpEscape" }
+
+  /**
+   * Holds if `source` is a literal containing a suspicious escape, and `report` is
+   * the fragment of the alert message that describes that escape.
+   */
+  predicate isSource(DataFlow::Node source, string report) {
+    exists(string character | containsEscapedCharacter(source, character) |
+      character = "a" and
+      report =
+        "the bell character \\a; did you mean \\\\a, the Vim alphabetic character class (use [[:alpha:]] instead) or \\\\A, the beginning of text?"
+      or
+      character = "b" and
+      report = "a literal backspace \\b; did you mean \\\\b, a word boundary?"
+    )
+  }
+
+  override predicate isSource(DataFlow::Node source) {
+    exists(string report | this.isSource(source, report))
+  }
+
+  override predicate isSink(DataFlow::Node sink) { sink instanceof RegexpPattern }
+}
+
+// Report each suspicious literal that flows to a regular-expression use; the alert is
+// placed on the literal, and the message links both the literal and the use site.
+from Config cfg, DataFlow::PathNode src, DataFlow::PathNode snk, string message
+where cfg.isSource(src.getNode(), message) and cfg.hasFlowPath(src, snk)
+select src, src, snk, "$@ used $@ contains " + message, src, "A regular expression", snk, "here"
diff --git a/ql/src/Security/CWE-020/SuspiciousCharacterInRegexpGood.go b/ql/src/Security/CWE-020/SuspiciousCharacterInRegexpGood.go
@@ -0,0 +1,13 @@
+package main
+
+import "regexp"
+
+// fixed rejects input that mentions forbidden.host.org. The doubled backslash
+// in "\\b" passes \b through to the regexp engine, where it is a word boundary.
+func fixed(hostNames []byte) string {
+	var hostRe = regexp.MustCompile("\\bforbidden.host.org")
+	if hostRe.Match(hostNames) {
+		return "Must not target forbidden.host.org"
+	}
+	// No match: the pattern, anchored at a word boundary by "\\b" rather than
+	// requiring a literal backspace, was not found in hostNames.
+	return ""
+}
diff --git a/ql/test/query-tests/Security/CWE-020/SuspiciousCharacterInRegexp/SuspiciousCharacterInRegexp.expected b/ql/test/query-tests/Security/CWE-020/SuspiciousCharacterInRegexp/SuspiciousCharacterInRegexp.expected
@@ -0,0 +1,11 @@
+edges
+nodes
+| test.go:8:21:8:34 | "hello\\aworld" | semmle.label | "hello\\aworld" |
+| test.go:9:21:9:36 | "hello\\\\\\aworld" | semmle.label | "hello\\\\\\aworld" |
+| test.go:10:21:10:34 | "hello\\bworld" | semmle.label | "hello\\bworld" |
+| test.go:11:21:11:36 | "hello\\\\\\bworld" | semmle.label | "hello\\\\\\bworld" |
+#select
+| test.go:8:21:8:34 | "hello\\aworld" | test.go:8:21:8:34 | "hello\\aworld" | test.go:8:21:8:34 | "hello\\aworld" | $@ used $@ contains the bell character \\a; did you mean \\\\a, the Vim alphabetic character class (use [[:alpha:]] instead) or \\\\A, the beginning of text? | test.go:8:21:8:34 | "hello\\aworld" | A regular expression | test.go:8:21:8:34 | "hello\\aworld" | here |
+| test.go:9:21:9:36 | "hello\\\\\\aworld" | test.go:9:21:9:36 | "hello\\\\\\aworld" | test.go:9:21:9:36 | "hello\\\\\\aworld" | $@ used $@ contains the bell character \\a; did you mean \\\\a, the Vim alphabetic character class (use [[:alpha:]] instead) or \\\\A, the beginning of text? | test.go:9:21:9:36 | "hello\\\\\\aworld" | A regular expression | test.go:9:21:9:36 | "hello\\\\\\aworld" | here |
+| test.go:10:21:10:34 | "hello\\bworld" | test.go:10:21:10:34 | "hello\\bworld" | test.go:10:21:10:34 | "hello\\bworld" | $@ used $@ contains a literal backspace \\b; did you mean \\\\b, a word boundary? | test.go:10:21:10:34 | "hello\\bworld" | A regular expression | test.go:10:21:10:34 | "hello\\bworld" | here |
+| test.go:11:21:11:36 | "hello\\\\\\bworld" | test.go:11:21:11:36 | "hello\\\\\\bworld" | test.go:11:21:11:36 | "hello\\\\\\bworld" | $@ used $@ contains a literal backspace \\b; did you mean \\\\b, a word boundary? | test.go:11:21:11:36 | "hello\\\\\\bworld" | A regular expression | test.go:11:21:11:36 | "hello\\\\\\bworld" | here |
diff --git a/ql/test/query-tests/Security/CWE-020/SuspiciousCharacterInRegexp/SuspiciousCharacterInRegexp.qlref b/ql/test/query-tests/Security/CWE-020/SuspiciousCharacterInRegexp/SuspiciousCharacterInRegexp.qlref
@@ -0,0 +1 @@
+Security/CWE-020/SuspiciousCharacterInRegexp.ql
diff --git a/ql/test/query-tests/Security/CWE-020/SuspiciousCharacterInRegexp/test.go b/ql/test/query-tests/Security/CWE-020/SuspiciousCharacterInRegexp/test.go
@@ -0,0 +1,24 @@
+package test
+
+import "regexp"
+
+func test() {
+
+	// BAD: probably a mistake: "\a" and "\b" are Go string escapes (bell, backspace), so the control character itself reaches the regexp:
+	regexp.MustCompile("hello\aworld")
+	regexp.MustCompile("hello\\\aworld")
+	regexp.MustCompile("hello\bworld")
+	regexp.MustCompile("hello\\\bworld")
+	// GOOD: more likely deliberate: an escaped backslash ("\\a", "\\b"), or the control character spelled as a numeric or Unicode escape:
+	regexp.MustCompile("hello\\aworld")
+	regexp.MustCompile("hello\x07world")
+	regexp.MustCompile("hello\007world")
+	regexp.MustCompile("hello\u0007world")
+	regexp.MustCompile("hello\U00000007world")
+	regexp.MustCompile("hello\\bworld")
+	regexp.MustCompile("hello\x08world")
+	regexp.MustCompile("hello\010world")
+	regexp.MustCompile("hello\u0008world")
+	regexp.MustCompile("hello\U00000008world")
+
+}

Original file line number	Diff line number	Diff line change
`@@ -0,0 +1,2 @@`
	`1`	`+lgtm,codescanning`
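	`2`	+* A new query `go/suspicious-character-in-regex` has been added. The query flags regular expressions built from string literals containing the Go escape sequences `\b` (backspace) and `\a` (bell), where the regular-expression escape `\\b` (word boundary) or `\\A` (start of text) was likely intended.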
Original file line number	Diff line number	Diff line change
`@@ -0,0 +1 @@`
	`1`	`+Security/CWE-020/SuspiciousCharacterInRegexp.ql`