Skip to content

Commit 117fb5b

Browse files
authored
Merge pull request github#7917 from aibaars/incomplete-hostname
Ruby: IncompleteHostnameRegExp.ql
2 parents 7674535 + 4a27928 commit 117fb5b

24 files changed

+564
-107
lines changed

config/identical-files.json

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -516,8 +516,12 @@
516516
"javascript/ql/lib/semmle/javascript/frameworks/data/internal/AccessPathSyntax.qll",
517517
"ruby/ql/lib/codeql/ruby/dataflow/internal/AccessPathSyntax.qll"
518518
],
519+
"Hostname Regexp queries": [
520+
"javascript/ql/src/Security/CWE-020/HostnameRegexpShared.qll",
521+
"ruby/ql/src/queries/security/cwe-020/HostnameRegexpShared.qll"
522+
],
519523
"ApiGraphModels": [
520524
"javascript/ql/lib/semmle/javascript/frameworks/data/internal/ApiGraphModels.qll",
521525
"ruby/ql/lib/codeql/ruby/frameworks/data/internal/ApiGraphModels.qll"
522526
]
523-
}
527+
}

javascript/ql/lib/semmle/javascript/Regexp.qll

Lines changed: 8 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -990,16 +990,22 @@ predicate isInterpretedAsRegExp(DataFlow::Node source) {
990990
}
991991

992992
/**
993-
* Provides regular expression patterns.
993+
* Provides utility predicates related to regular expressions.
994994
*/
995995
module RegExpPatterns {
996996
/**
997997
* Gets a pattern that matches common top-level domain names in lower case.
998998
*/
999-
string commonTLD() {
999+
string getACommonTld() {
10001000
// according to ranking by http://google.com/search?q=site:.<<TLD>>
10011001
result = "(?:com|org|edu|gov|uk|net|io)(?![a-z0-9])"
10021002
}
1003+
1004+
/**
1005+
* Gets a pattern that matches common top-level domain names in lower case.
1006+
* DEPRECATED: use `getACommonTld` instead
1007+
*/
1008+
deprecated predicate commonTLD = getACommonTld/0;
10031009
}
10041010

10051011
/**

javascript/ql/src/Security/CWE-020/HostnameRegexpShared.qll

Lines changed: 95 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@
33
* that match URLs and hostname patterns.
44
*/
55

6-
import javascript
6+
private import HostnameRegexpSpecific
77

88
/**
99
* Holds if the given constant is unlikely to occur in the origin part of a URL.
@@ -62,7 +62,7 @@ predicate hasTopLevelDomainEnding(RegExpSequence seq, int i) {
6262
seq.getChild(i)
6363
.(RegExpConstant)
6464
.getValue()
65-
.regexpMatch("(?i)" + RegExpPatterns::commonTLD() + "(:\\d+)?([/?#].*)?") and
65+
.regexpMatch("(?i)" + RegExpPatterns::getACommonTld() + "(:\\d+)?([/?#].*)?") and
6666
isDotLike(seq.getChild(i - 1)) and
6767
not (i = 1 and matchesBeginningOfString(seq))
6868
}
@@ -107,3 +107,96 @@ predicate alwaysMatchesHostnameAlt(RegExpAlt alt, int i) {
107107
alwaysMatchesHostnameAlt(alt, i - 1) and
108108
alwaysMatchesHostname(alt.getChild(i))
109109
}
110+
111+
/**
112+
* Holds if `term` occurs inside a quantifier or alternative (and thus
113+
* can not be expected to correspond to a unique match), or as part of
114+
* a lookaround assertion (which are rarely used for capture groups).
115+
*/
116+
predicate isInsideChoiceOrSubPattern(RegExpTerm term) {
117+
exists(RegExpParent parent | parent = term.getParent() |
118+
parent instanceof RegExpAlt
119+
or
120+
parent instanceof RegExpQuantifier
121+
or
122+
parent instanceof RegExpSubPattern
123+
or
124+
isInsideChoiceOrSubPattern(parent)
125+
)
126+
}
127+
128+
/**
129+
* Holds if `group` is likely to be used as a capture group.
130+
*/
131+
predicate isLikelyCaptureGroup(RegExpGroup group) {
132+
group.isCapture() and
133+
not isInsideChoiceOrSubPattern(group)
134+
}
135+
136+
/**
137+
* Holds if `seq` contains two consecutive dots `..` or escaped dots.
138+
*
139+
* At least one of these dots is not intended to be a subdomain separator,
140+
* so we avoid flagging the pattern in this case.
141+
*/
142+
predicate hasConsecutiveDots(RegExpSequence seq) {
143+
exists(int i |
144+
isDotLike(seq.getChild(i)) and
145+
isDotLike(seq.getChild(i + 1))
146+
)
147+
}
148+
149+
predicate isIncompleteHostNameRegExpPattern(RegExpTerm regexp, RegExpSequence seq, string msg) {
150+
seq = regexp.getAChild*() and
151+
exists(RegExpDot unescapedDot, int i, string hostname |
152+
hasTopLevelDomainEnding(seq, i) and
153+
not isConstantInvalidInsideOrigin(seq.getChild([0 .. i - 1]).getAChild*()) and
154+
not isLikelyCaptureGroup(seq.getChild([i .. seq.getNumChild() - 1]).getAChild*()) and
155+
unescapedDot = seq.getChild([0 .. i - 1]).getAChild*() and
156+
unescapedDot != seq.getChild(i - 1) and // Should not be the '.' immediately before the TLD
157+
not hasConsecutiveDots(unescapedDot.getParent()) and
158+
hostname =
159+
seq.getChild(i - 2).getRawValue() + seq.getChild(i - 1).getRawValue() +
160+
seq.getChild(i).getRawValue()
161+
|
162+
if unescapedDot.getParent() instanceof RegExpQuantifier
163+
then
164+
// `.*\.example.com` can match `evil.com/?x=.example.com`
165+
//
166+
// This problem only occurs when the pattern is applied against a full URL, not just a hostname/origin.
167+
// We therefore check if the pattern includes a suffix after the TLD, such as `.*\.example.com/`.
168+
// Note that a post-anchored pattern (`.*\.example.com$`) will usually fail to match a full URL,
169+
// and patterns with neither a suffix nor an anchor fall under the purview of MissingRegExpAnchor.
170+
seq.getChild(0) instanceof RegExpCaret and
171+
not seq.getAChild() instanceof RegExpDollar and
172+
seq.getChild([i .. i + 1]).(RegExpConstant).getValue().regexpMatch(".*[/?#].*") and
173+
msg =
174+
"has an unrestricted wildcard '" + unescapedDot.getParent().(RegExpQuantifier).getRawValue()
175+
+ "' which may cause '" + hostname +
176+
"' to be matched anywhere in the URL, outside the hostname."
177+
else
178+
msg =
179+
"has an unescaped '.' before '" + hostname +
180+
"', so it might match more hosts than expected."
181+
)
182+
}
183+
184+
predicate incompleteHostnameRegExp(
185+
RegExpSequence hostSequence, string message, DataFlow::Node aux, string label
186+
) {
187+
exists(RegExpPatternSource re, RegExpTerm regexp, string msg, string kind |
188+
regexp = re.getRegExpTerm() and
189+
isIncompleteHostNameRegExpPattern(regexp, hostSequence, msg) and
190+
(
191+
if re.getAParse() != re
192+
then (
193+
kind = "string, which is used as a regular expression $@," and
194+
aux = re.getAParse()
195+
) else (
196+
kind = "regular expression" and aux = re
197+
)
198+
)
199+
|
200+
message = "This " + kind + " " + msg and label = "here"
201+
)
202+
}
Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
import javascript

javascript/ql/src/Security/CWE-020/IncompleteHostnameRegExp.qhelp

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -30,7 +30,7 @@
3030
<p>
3131

3232
Escape all meta-characters appropriately when constructing
33-
regular expressions for security checks, pay special attention to the
33+
regular expressions for security checks, and pay special attention to the
3434
<code>.</code> meta-character.
3535

3636
</p>
@@ -59,7 +59,7 @@
5959
<p>
6060

6161
Address this vulnerability by escaping <code>.</code>
62-
appropriately: <code>let regex = /((www|beta)\.)?example\.com/</code>.
62+
appropriately: <code>let regex = /^((www|beta)\.)?example\.com/</code>.
6363

6464
</p>
6565

javascript/ql/src/Security/CWE-020/IncompleteHostnameRegExp.ql

Lines changed: 1 addition & 92 deletions
Original file line numberDiff line numberDiff line change
@@ -11,97 +11,6 @@
1111
* external/cwe/cwe-020
1212
*/
1313

14-
import javascript
15-
import semmle.javascript.CharacterEscapes
1614
import HostnameRegexpShared
1715

18-
/**
19-
* Holds if `term` occurs inside a quantifier or alternative (and thus
20-
* can not be expected to correspond to a unique match), or as part of
21-
* a lookaround assertion (which are rarely used for capture groups).
22-
*/
23-
predicate isInsideChoiceOrSubPattern(RegExpTerm term) {
24-
exists(RegExpParent parent | parent = term.getParent() |
25-
parent instanceof RegExpAlt
26-
or
27-
parent instanceof RegExpQuantifier
28-
or
29-
parent instanceof RegExpSubPattern
30-
or
31-
isInsideChoiceOrSubPattern(parent)
32-
)
33-
}
34-
35-
/**
36-
* Holds if `group` is likely to be used as a capture group.
37-
*/
38-
predicate isLikelyCaptureGroup(RegExpGroup group) {
39-
group.isCapture() and
40-
not isInsideChoiceOrSubPattern(group)
41-
}
42-
43-
/**
44-
* Holds if `seq` contains two consecutive dots `..` or escaped dots.
45-
*
46-
* At least one of these dots is not intended to be a subdomain separator,
47-
* so we avoid flagging the pattern in this case.
48-
*/
49-
predicate hasConsecutiveDots(RegExpSequence seq) {
50-
exists(int i |
51-
isDotLike(seq.getChild(i)) and
52-
isDotLike(seq.getChild(i + 1))
53-
)
54-
}
55-
56-
predicate isIncompleteHostNameRegExpPattern(RegExpTerm regexp, RegExpSequence seq, string msg) {
57-
seq = regexp.getAChild*() and
58-
exists(RegExpDot unescapedDot, int i, string hostname |
59-
hasTopLevelDomainEnding(seq, i) and
60-
not isConstantInvalidInsideOrigin(seq.getChild([0 .. i - 1]).getAChild*()) and
61-
not isLikelyCaptureGroup(seq.getChild([i .. seq.getNumChild() - 1]).getAChild*()) and
62-
unescapedDot = seq.getChild([0 .. i - 1]).getAChild*() and
63-
unescapedDot != seq.getChild(i - 1) and // Should not be the '.' immediately before the TLD
64-
not hasConsecutiveDots(unescapedDot.getParent()) and
65-
hostname =
66-
seq.getChild(i - 2).getRawValue() + seq.getChild(i - 1).getRawValue() +
67-
seq.getChild(i).getRawValue()
68-
|
69-
if unescapedDot.getParent() instanceof RegExpQuantifier
70-
then
71-
// `.*\.example.com` can match `evil.com/?x=.example.com`
72-
//
73-
// This problem only occurs when the pattern is applied against a full URL, not just a hostname/origin.
74-
// We therefore check if the pattern includes a suffix after the TLD, such as `.*\.example.com/`.
75-
// Note that a post-anchored pattern (`.*\.example.com$`) will usually fail to match a full URL,
76-
// and patterns with neither a suffix nor an anchor fall under the purview of MissingRegExpAnchor.
77-
seq.getChild(0) instanceof RegExpCaret and
78-
not seq.getAChild() instanceof RegExpDollar and
79-
seq.getChild([i .. i + 1]).(RegExpConstant).getValue().regexpMatch(".*[/?#].*") and
80-
msg =
81-
"has an unrestricted wildcard '" + unescapedDot.getParent().(RegExpQuantifier).getRawValue()
82-
+ "' which may cause '" + hostname +
83-
"' to be matched anywhere in the URL, outside the hostname."
84-
else
85-
msg =
86-
"has an unescaped '.' before '" + hostname +
87-
"', so it might match more hosts than expected."
88-
)
89-
}
90-
91-
from
92-
RegExpPatternSource re, RegExpTerm regexp, RegExpSequence hostSequence, string msg, string kind,
93-
DataFlow::Node aux
94-
where
95-
regexp = re.getRegExpTerm() and
96-
isIncompleteHostNameRegExpPattern(regexp, hostSequence, msg) and
97-
(
98-
if re.getAParse() != re
99-
then (
100-
kind = "string, which is used as a regular expression $@," and
101-
aux = re.getAParse()
102-
) else (
103-
kind = "regular expression" and aux = re
104-
)
105-
) and
106-
not CharacterEscapes::hasALikelyRegExpPatternMistake(re)
107-
select hostSequence, "This " + kind + " " + msg, aux, "here"
16+
query predicate problems = incompleteHostnameRegExp/4;

javascript/ql/src/Security/CWE-020/IncompleteUrlSubstringSanitization.ql

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -39,7 +39,7 @@ where
3939
(
4040
// target contains a domain on a common TLD, and perhaps some other URL components
4141
target
42-
.regexpMatch("(?i)([a-z]*:?//)?\\.?([a-z0-9-]+\\.)+" + RegExpPatterns::commonTLD() +
42+
.regexpMatch("(?i)([a-z]*:?//)?\\.?([a-z0-9-]+\\.)+" + RegExpPatterns::getACommonTld() +
4343
"(:[0-9]+)?/?")
4444
or
4545
// target is a HTTP URL to a domain on any TLD

javascript/ql/test/query-tests/Security/CWE-020/IncompleteHostnameRegExp.expected

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -23,4 +23,5 @@
2323
| tst-IncompleteHostnameRegExp.js:48:42:48:47 | ^https?://.+.example\\.com/ | This regular expression has an unescaped '.' before 'example\\.com/', so it might match more hosts than expected. | tst-IncompleteHostnameRegExp.js:48:13:48:69 | '^http: ... \\.com/' | here |
2424
| tst-IncompleteHostnameRegExp.js:48:42:48:47 | ^https?://.+.example\\.com/ | This regular expression has an unrestricted wildcard '.+' which may cause 'example\\.com/' to be matched anywhere in the URL, outside the hostname. | tst-IncompleteHostnameRegExp.js:48:13:48:69 | '^http: ... \\.com/' | here |
2525
| tst-IncompleteHostnameRegExp.js:53:14:53:35 | test.example.com$ | This regular expression has an unescaped '.' before 'example.com', so it might match more hosts than expected. | tst-IncompleteHostnameRegExp.js:53:13:53:36 | 'test.' ... e.com$' | here |
26+
| tst-IncompleteHostnameRegExp.js:55:14:55:38 | ^http://test.example.com | This regular expression has an unescaped '.' before 'example.com', so it might match more hosts than expected. | tst-IncompleteHostnameRegExp.js:55:13:55:39 | '^http: ... le.com' | here |
2627
| tst-IncompleteHostnameRegExp.js:59:5:59:20 | foo.example\\.com | This regular expression has an unescaped '.' before 'example\\.com', so it might match more hosts than expected. | tst-IncompleteHostnameRegExp.js:59:2:59:32 | /^(foo. ... ever)$/ | here |

javascript/ql/test/query-tests/Security/CWE-020/tst-IncompleteHostnameRegExp.js

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -52,7 +52,7 @@
5252

5353
new RegExp('test.' + 'example.com$'); // NOT OK
5454

55-
new RegExp('^http://test\.example.com'); // NOT OK, but flagged by js/useless-regexp-character-escape
55+
new RegExp('^http://test\.example.com'); // NOT OK
5656

5757
/^http:\/\/(..|...)\.example\.com\/index\.html/; // OK, wildcards are intentional
5858
/^http:\/\/.\.example\.com\/index\.html/; // OK, the wildcard is intentional

python/ql/src/Security/CWE-020/IncompleteHostnameRegExp.qhelp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -30,7 +30,7 @@
3030
<p>
3131

3232
Escape all meta-characters appropriately when constructing
33-
regular expressions for security checks, pay special attention to the
33+
regular expressions for security checks, and pay special attention to the
3434
<code>.</code> meta-character.
3535

3636
</p>

0 commit comments

Comments
 (0)