Skip to content
This repository was archived by the owner on Jan 5, 2023. It is now read-only.

Commit 907ae20

Browse files
authored
Merge pull request #350 from smowton/smowton/feature/bad-regex-escape-query
Add query spotting probably-bad escapes in regular expressions.
2 parents a094ddb + 5913804 commit 907ae20

File tree

8 files changed

+184
-0
lines changed

8 files changed

+184
-0
lines changed
Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,2 @@
1+
lgtm,codescanning
2+
* A new query `go/suspicious-character-in-regex` has been added. The query flags uses of `\b` and `\a` in regular expressions, where a character class was likely intended.
Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,13 @@
1+
package main
2+
3+
import "regexp"
4+
5+
func broken(hostNames []byte) string {
6+
var htmlRe = regexp.MustCompile("\bforbidden.host.org")
7+
if htmlRe.Match(hostNames) {
8+
return "Must not target forbidden.host.org"
9+
} else {
10+
// This will be reached even if hostNames is exactly "forbidden.host.org",
11+
// because the literal backspace is not matched
12+
}
13+
}
Lines changed: 66 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,66 @@
1+
<!DOCTYPE qhelp PUBLIC
2+
"-//Semmle//qhelp//EN"
3+
"qhelp.dtd">
4+
<qhelp>
5+
6+
<overview>
7+
<p>
8+
9+
When a character in a string literal or regular expression
10+
literal is preceded by a backslash, it is interpreted as part of an
11+
escape sequence. For example, the escape sequence <code>\n</code> in a
12+
string literal corresponds to a single <code>newline</code> character,
13+
and not the <code>\</code> and <code>n</code> characters.
14+
15+
There are two Go escape sequences that could produce surprising results.
16+
First, <code>regexp.Compile("\a")</code> matches the bell character, whereas
17+
<code>regexp.Compile("\\A")</code> matches the start of text and
18+
<code>regexp.Compile("\\a")</code> is a Vim (but not Go) regular expression
19+
matching any alphabetic character. Second, <code>regexp.Compile("\b")</code>
20+
matches a backspace, whereas <code>regexp.Compile("\\b")</code> matches the
21+
start of a word. Confusing one for the other could lead to a regular expression
22+
passing or failing much more often than expected, with potential security
23+
consequences.
24+
25+
Note this is less of a problem than in some other languages because in Go,
26+
only valid escape sequences are accepted, both in an ordinary string
27+
(for example, <code>s := "\k"</code> will not compile as there is no such
28+
escape sequence) and in regular expressions (for example,
29+
<code>regexp.MustCompile("\\k")</code> will panic as <code>\k</code> does not
30+
refer to a character class or other special token according to Go's regular
31+
expression grammar).
32+
33+
</p>
34+
35+
</overview>
36+
37+
<recommendation>
38+
<p>
39+
40+
Ensure that the right number of backslashes is used when
41+
escaping characters in strings and regular
42+
expressions.
43+
44+
</p>
45+
</recommendation>
46+
47+
<example>
48+
49+
<p>The following example code fails to check for a forbidden word in an input string:</p>
50+
<sample src="SuspiciousCharacterInRegexp.go"/>
51+
<p>The check does not work, but can be fixed by escaping the backslash:</p>
52+
<sample src="SuspiciousCharacterInRegexpGood.go"/>
53+
<p>
54+
Alternatively, you can use backtick-delimited raw string literals.
55+
For example, the <code>\b</code> in <code>regexp.Compile(`hello\bworld`)</code>
56+
matches a word boundary, not a backspace character, as within backticks <code>\b</code> is not an
57+
escape sequence.
58+
</p>
59+
60+
</example>
61+
62+
<references>
63+
<li>golang.org: <a href="https://golang.org/pkg/regexp/">Overview of the Regexp package</a>.</li>
64+
<li>Google: <a href="https://github.com/google/re2/wiki/Syntax">Syntax of regular expressions accepted by RE2</a>.</li>
65+
</references>
66+
</qhelp>
Lines changed: 54 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,54 @@
1+
/**
2+
* @name Suspicious characters in a regular expression
3+
* @description If a literal bell character or backspace appears in a regular expression, the start of text or word boundary may have been intended.
4+
* @kind path-problem
5+
* @problem.severity warning
6+
* @precision high
7+
* @id go/suspicious-character-in-regex
8+
* @tags correctness
9+
* security
10+
* external/cwe/cwe-20
11+
*/
12+
13+
import go
14+
import DataFlow::PathGraph
15+
16+
/**
17+
* Holds if `source` corresponds to a string literal that contains an escaped `character`.
18+
*
19+
* `character` must be `"a"` or `"b"`, the only interesting escapes for this query.
20+
*/
21+
predicate containsEscapedCharacter(DataFlow::Node source, string character) {
22+
character in ["a", "b"] and
23+
exists(
24+
// Search for `character` preceded by an odd number of backslashes:
25+
source
26+
.asExpr()
27+
.(BasicLit)
28+
.getText()
29+
.regexpFind("(?<=(^|[^\\\\])\\\\(\\\\{2}){0,10})" + character, _, _)
30+
)
31+
}
32+
33+
/** A dataflow configuration that traces strings containing suspicious escape sequences to a use as a regular expression. */
34+
class Config extends DataFlow::Configuration {
35+
Config() { this = "SuspiciousRegexpEscape" }
36+
37+
predicate isSource(DataFlow::Node source, string report) {
38+
containsEscapedCharacter(source, "a") and
39+
report =
40+
"the bell character \\a; did you mean \\\\a, the Vim alphabetic character class (use [[:alpha:]] instead) or \\\\A, the beginning of text?"
41+
or
42+
containsEscapedCharacter(source, "b") and
43+
report = "a literal backspace \\b; did you mean \\\\b, a word boundary?"
44+
}
45+
46+
override predicate isSource(DataFlow::Node source) { isSource(source, _) }
47+
48+
override predicate isSink(DataFlow::Node sink) { sink instanceof RegexpPattern }
49+
}
50+
51+
from Config c, DataFlow::PathNode source, DataFlow::PathNode sink, string report
52+
where c.hasFlowPath(source, sink) and c.isSource(source.getNode(), report)
53+
select source, source, sink, "$@ used $@ contains " + report, source, "A regular expression", sink,
54+
"here"
Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,13 @@
1+
package main
2+
3+
import "regexp"
4+
5+
func fixed(hostNames []byte) string {
6+
var htmlRe = regexp.MustCompile("\\bforbidden.host.org")
7+
if htmlRe.Match(hostNames) {
8+
return "Must not target forbidden.host.org"
9+
} else {
10+
// hostNames definitely doesn't contain a word "forbidden.host.org", as "\\b"
11+
// is the start-of-word anchor, not a literal backspace.
12+
}
13+
}
Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,11 @@
1+
edges
2+
nodes
3+
| test.go:8:21:8:34 | "hello\\aworld" | semmle.label | "hello\\aworld" |
4+
| test.go:9:21:9:36 | "hello\\\\\\aworld" | semmle.label | "hello\\\\\\aworld" |
5+
| test.go:10:21:10:34 | "hello\\bworld" | semmle.label | "hello\\bworld" |
6+
| test.go:11:21:11:36 | "hello\\\\\\bworld" | semmle.label | "hello\\\\\\bworld" |
7+
#select
8+
| test.go:8:21:8:34 | "hello\\aworld" | test.go:8:21:8:34 | "hello\\aworld" | test.go:8:21:8:34 | "hello\\aworld" | $@ used $@ contains the bell character \\a; did you mean \\\\a, the Vim alphabetic character class (use [[:alpha:]] instead) or \\\\A, the beginning of text? | test.go:8:21:8:34 | "hello\\aworld" | A regular expression | test.go:8:21:8:34 | "hello\\aworld" | here |
9+
| test.go:9:21:9:36 | "hello\\\\\\aworld" | test.go:9:21:9:36 | "hello\\\\\\aworld" | test.go:9:21:9:36 | "hello\\\\\\aworld" | $@ used $@ contains the bell character \\a; did you mean \\\\a, the Vim alphabetic character class (use [[:alpha:]] instead) or \\\\A, the beginning of text? | test.go:9:21:9:36 | "hello\\\\\\aworld" | A regular expression | test.go:9:21:9:36 | "hello\\\\\\aworld" | here |
10+
| test.go:10:21:10:34 | "hello\\bworld" | test.go:10:21:10:34 | "hello\\bworld" | test.go:10:21:10:34 | "hello\\bworld" | $@ used $@ contains a literal backspace \\b; did you mean \\\\b, a word boundary? | test.go:10:21:10:34 | "hello\\bworld" | A regular expression | test.go:10:21:10:34 | "hello\\bworld" | here |
11+
| test.go:11:21:11:36 | "hello\\\\\\bworld" | test.go:11:21:11:36 | "hello\\\\\\bworld" | test.go:11:21:11:36 | "hello\\\\\\bworld" | $@ used $@ contains a literal backspace \\b; did you mean \\\\b, a word boundary? | test.go:11:21:11:36 | "hello\\\\\\bworld" | A regular expression | test.go:11:21:11:36 | "hello\\\\\\bworld" | here |
Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
Security/CWE-020/SuspiciousCharacterInRegexp.ql
Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,24 @@
1+
package test
2+
3+
import "regexp"
4+
5+
func test() {
6+
7+
// BAD: probably a mistake:
8+
regexp.MustCompile("hello\aworld")
9+
regexp.MustCompile("hello\\\aworld")
10+
regexp.MustCompile("hello\bworld")
11+
regexp.MustCompile("hello\\\bworld")
12+
// GOOD: more likely deliberate:
13+
regexp.MustCompile("hello\\aworld")
14+
regexp.MustCompile("hello\x07world")
15+
regexp.MustCompile("hello\007world")
16+
regexp.MustCompile("hello\u0007world")
17+
regexp.MustCompile("hello\U00000007world")
18+
regexp.MustCompile("hello\\bworld")
19+
regexp.MustCompile("hello\x08world")
20+
regexp.MustCompile("hello\010world")
21+
regexp.MustCompile("hello\u0008world")
22+
regexp.MustCompile("hello\U00000008world")
23+
24+
}

0 commit comments

Comments
 (0)