forked from github/codeql
-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathMissingRegexpAnchor.ql
More file actions
83 lines (74 loc) · 2.99 KB
/
MissingRegexpAnchor.ql
File metadata and controls
83 lines (74 loc) · 2.99 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
/**
* @name Missing regular expression anchor
* @description Regular expressions without anchors can be vulnerable to bypassing.
* @kind problem
* @problem.severity warning
* @security-severity 7.8
* @precision high
* @id go/regex/missing-regexp-anchor
* @tags correctness
* security
* external/cwe/cwe-020
*/
import go
/**
 * Holds if the regular expression source `re` is a `|`-separated list of
 * alternatives in which only the first alternative carries a leading `^`,
 * or only the last alternative carries a trailing `$`. `msg` describes the
 * resulting operator-precedence mistake.
 *
 * For instance, `^a|b|c` is parsed as `(^a)|(b)|(c)`, so only `a` is
 * actually anchored.
 */
bindingset[re]
predicate isInterestingSemiAnchoredRegexpString(string re, string msg) {
  exists(string dotEscape, string plainRun, string alternative, string pattern, string anchored |
    // a backslash followed by a dot: an escaped dot in the analyzed expression,
    // as in `regexp.Compile("\\.")`
    dotEscape = "\\\\[.]" and
    // a non-empty run of escaped dots and characters that are mostly free of
    // special regular expression meaning
    plainRun = "(?:(?:" + dotEscape + ")|[a-z:/.?_,@0-9 -])+" and
    // a single alternative: such a run, optionally wrapped in parentheses
    alternative = "(?:" + plainRun + "|\\(" + plainRun + "\\))" and
    pattern =
      [
        // `^a|b|...|x`: only the first alternative is anchored (capture group 1)
        "(?i)(\\^" + alternative + ")(?:\\|" + alternative + ")+",
        // `a|b|...|x$`: only the last alternative is anchored (capture group 1)
        "(?i)(?:" + alternative + "\\|)+(" + alternative + "\\$)"
      ] and
    // `re` must match one of the shapes in full; keep the anchored alternative
    anchored = re.regexpCapture(pattern, 1) and
    // only report when the anchored alternative contains at least one letter
    anchored.regexpMatch("(?i).*[a-z].*") and
    msg =
      "Misleading operator precedence. The subexpression '" + anchored +
        "' is anchored, but the other parts of this regular expression are not."
  )
}
/**
 * Holds if the regular expression source `re` looks like a pattern for a
 * URL but contains no anchor at all. `msg` explains why that is a mistake.
 */
bindingset[re]
predicate isInterestingUnanchoredRegexpString(string re, string msg) {
  // none of the anchors `$`, `^`, `\A` or `\z` occurs anywhere in the pattern
  not re.regexpMatch(".*(\\$|\\^|\\\\A|\\\\z).*") and
  // the whole pattern reads as a protocol and subdomains (possibly mixed with
  // some regex characters), then a dot and a TLD taken from `commonTld()`,
  // optionally followed by a path, query, fragment or port-like suffix
  re.regexpMatch("(?i)[():|?a-z0-9-\\\\./]+[.]" + commonTld() + "([/#?():]\\S*)?") and
  msg =
    "When this is used as a regular expression on a URL, it may match anywhere, and arbitrary " +
      "hosts may come before or after it."
}
/**
 * Data-flow configuration tracking suspicious regular expression strings
 * to the places where they are used as regular expression patterns.
 */
module Config implements DataFlow::ConfigSig {
  /**
   * Holds if `source` is an expression whose string value is either an
   * unanchored URL-like pattern or a semi-anchored list of alternatives,
   * with `msg` describing the problem.
   */
  additional predicate isSourceString(DataFlow::Node source, string msg) {
    exists(string pattern | pattern = source.asExpr().getStringValue() |
      isInterestingSemiAnchoredRegexpString(pattern, msg)
      or
      isInterestingUnanchoredRegexpString(pattern, msg)
    )
  }

  /** Holds if `source` is a suspicious regular expression string, whatever the message. */
  predicate isSource(DataFlow::Node source) {
    exists(string msg | isSourceString(source, msg))
  }

  /** Holds if `sink` is a `RegexpPattern` node. */
  predicate isSink(DataFlow::Node sink) { sink instanceof RegexpPattern }

  /**
   * Holds unconditionally. Judging by its name, this opts the configuration
   * into diff-informed incremental mode; see `DataFlow::ConfigSig` for the
   * exact semantics.
   */
  predicate observeDiffInformedIncrementalMode() { any() }
}
/** Global data flow instantiated with `Config`. */
module Flow = DataFlow::Global<Config>;

// Report each suspicious regular expression string that flows to some sink,
// together with the message explaining what is wrong with it.
from DataFlow::Node source, string msg
where
  Config::isSourceString(source, msg) and
  exists(DataFlow::Node sink | Flow::flow(source, sink))
select source, msg