Skip to content

Commit 98f56f4

Browse files
committed
Js/Ruby: Share IncompleteHostnameRegExp.ql
1 parent 097c661 commit 98f56f4

File tree

8 files changed

+200
-188
lines changed

8 files changed

+200
-188
lines changed

config/identical-files.json

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -508,5 +508,9 @@
508508
"java/ql/lib/semmle/code/java/dataflow/internal/AccessPathSyntax.qll",
509509
"javascript/ql/lib/semmle/javascript/frameworks/data/internal/AccessPathSyntax.qll",
510510
"ruby/ql/lib/codeql/ruby/dataflow/internal/AccessPathSyntax.qll"
511+
],
512+
"Hostname Regexp queries": [
513+
"javascript/ql/src/Security/CWE-020/HostnameRegexpShared.qll",
514+
"ruby/ql/src/queries/security/cwe-020/HostnameRegexpShared.qll"
511515
]
512516
}

javascript/ql/src/Security/CWE-020/HostnameRegexpShared.qll

Lines changed: 94 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@
33
* that match URLs and hostname patterns.
44
*/
55

6-
import javascript
6+
private import HostnameRegexpSpecific
77

88
/**
99
* Holds if the given constant is unlikely to occur in the origin part of a URL.
@@ -107,3 +107,96 @@ predicate alwaysMatchesHostnameAlt(RegExpAlt alt, int i) {
107107
alwaysMatchesHostnameAlt(alt, i - 1) and
108108
alwaysMatchesHostname(alt.getChild(i))
109109
}
110+
111+
/**
112+
* Holds if `term` occurs inside a quantifier or alternative (and thus
113+
* can not be expected to correspond to a unique match), or as part of
114+
* a lookaround assertion (which are rarely used for capture groups).
115+
*/
116+
predicate isInsideChoiceOrSubPattern(RegExpTerm term) {
117+
exists(RegExpParent parent | parent = term.getParent() |
118+
parent instanceof RegExpAlt
119+
or
120+
parent instanceof RegExpQuantifier
121+
or
122+
parent instanceof RegExpSubPattern
123+
or
124+
isInsideChoiceOrSubPattern(parent)
125+
)
126+
}
127+
128+
/**
129+
* Holds if `group` is likely to be used as a capture group.
130+
*/
131+
predicate isLikelyCaptureGroup(RegExpGroup group) {
132+
group.isCapture() and
133+
not isInsideChoiceOrSubPattern(group)
134+
}
135+
136+
/**
137+
* Holds if `seq` contains two consecutive dots `..` or escaped dots.
138+
*
139+
* At least one of these dots is not intended to be a subdomain separator,
140+
* so we avoid flagging the pattern in this case.
141+
*/
142+
predicate hasConsecutiveDots(RegExpSequence seq) {
143+
exists(int i |
144+
isDotLike(seq.getChild(i)) and
145+
isDotLike(seq.getChild(i + 1))
146+
)
147+
}
148+
149+
predicate isIncompleteHostNameRegExpPattern(RegExpTerm regexp, RegExpSequence seq, string msg) {
150+
seq = regexp.getAChild*() and
151+
exists(RegExpDot unescapedDot, int i, string hostname |
152+
hasTopLevelDomainEnding(seq, i) and
153+
not isConstantInvalidInsideOrigin(seq.getChild([0 .. i - 1]).getAChild*()) and
154+
not isLikelyCaptureGroup(seq.getChild([i .. seq.getNumChild() - 1]).getAChild*()) and
155+
unescapedDot = seq.getChild([0 .. i - 1]).getAChild*() and
156+
unescapedDot != seq.getChild(i - 1) and // Should not be the '.' immediately before the TLD
157+
not hasConsecutiveDots(unescapedDot.getParent()) and
158+
hostname =
159+
seq.getChild(i - 2).getRawValue() + seq.getChild(i - 1).getRawValue() +
160+
seq.getChild(i).getRawValue()
161+
|
162+
if unescapedDot.getParent() instanceof RegExpQuantifier
163+
then
164+
// `.*\.example.com` can match `evil.com/?x=.example.com`
165+
//
166+
// This problem only occurs when the pattern is applied against a full URL, not just a hostname/origin.
167+
// We therefore check if the pattern includes a suffix after the TLD, such as `.*\.example.com/`.
168+
// Note that a post-anchored pattern (`.*\.example.com$`) will usually fail to match a full URL,
169+
// and patterns with neither a suffix nor an anchor fall under the purview of MissingRegExpAnchor.
170+
seq.getChild(0) instanceof RegExpCaret and
171+
not seq.getAChild() instanceof RegExpDollar and
172+
seq.getChild([i .. i + 1]).(RegExpConstant).getValue().regexpMatch(".*[/?#].*") and
173+
msg =
174+
"has an unrestricted wildcard '" + unescapedDot.getParent().(RegExpQuantifier).getRawValue()
175+
+ "' which may cause '" + hostname +
176+
"' to be matched anywhere in the URL, outside the hostname."
177+
else
178+
msg =
179+
"has an unescaped '.' before '" + hostname +
180+
"', so it might match more hosts than expected."
181+
)
182+
}
183+
184+
predicate incompleteHostnameRegExp(
185+
RegExpSequence hostSequence, string message, DataFlow::Node aux, string label
186+
) {
187+
exists(RegExpPatternSource re, RegExpTerm regexp, string msg, string kind |
188+
regexp = re.getRegExpTerm() and
189+
isIncompleteHostNameRegExpPattern(regexp, hostSequence, msg) and
190+
(
191+
if re.getAParse() != re
192+
then (
193+
kind = "string, which is used as a regular expression $@," and
194+
aux = re.getAParse()
195+
) else (
196+
kind = "regular expression" and aux = re
197+
)
198+
)
199+
|
200+
message = "This " + kind + " " + msg and label = "here"
201+
)
202+
}
Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
import javascript

javascript/ql/src/Security/CWE-020/IncompleteHostnameRegExp.ql

Lines changed: 1 addition & 92 deletions
Original file line numberDiff line numberDiff line change
@@ -11,97 +11,6 @@
1111
* external/cwe/cwe-020
1212
*/
1313

14-
import javascript
15-
import semmle.javascript.CharacterEscapes
1614
import HostnameRegexpShared
1715

18-
/**
19-
* Holds if `term` occurs inside a quantifier or alternative (and thus
20-
* can not be expected to correspond to a unique match), or as part of
21-
* a lookaround assertion (which are rarely used for capture groups).
22-
*/
23-
predicate isInsideChoiceOrSubPattern(RegExpTerm term) {
24-
exists(RegExpParent parent | parent = term.getParent() |
25-
parent instanceof RegExpAlt
26-
or
27-
parent instanceof RegExpQuantifier
28-
or
29-
parent instanceof RegExpSubPattern
30-
or
31-
isInsideChoiceOrSubPattern(parent)
32-
)
33-
}
34-
35-
/**
36-
* Holds if `group` is likely to be used as a capture group.
37-
*/
38-
predicate isLikelyCaptureGroup(RegExpGroup group) {
39-
group.isCapture() and
40-
not isInsideChoiceOrSubPattern(group)
41-
}
42-
43-
/**
44-
* Holds if `seq` contains two consecutive dots `..` or escaped dots.
45-
*
46-
* At least one of these dots is not intended to be a subdomain separator,
47-
* so we avoid flagging the pattern in this case.
48-
*/
49-
predicate hasConsecutiveDots(RegExpSequence seq) {
50-
exists(int i |
51-
isDotLike(seq.getChild(i)) and
52-
isDotLike(seq.getChild(i + 1))
53-
)
54-
}
55-
56-
predicate isIncompleteHostNameRegExpPattern(RegExpTerm regexp, RegExpSequence seq, string msg) {
57-
seq = regexp.getAChild*() and
58-
exists(RegExpDot unescapedDot, int i, string hostname |
59-
hasTopLevelDomainEnding(seq, i) and
60-
not isConstantInvalidInsideOrigin(seq.getChild([0 .. i - 1]).getAChild*()) and
61-
not isLikelyCaptureGroup(seq.getChild([i .. seq.getNumChild() - 1]).getAChild*()) and
62-
unescapedDot = seq.getChild([0 .. i - 1]).getAChild*() and
63-
unescapedDot != seq.getChild(i - 1) and // Should not be the '.' immediately before the TLD
64-
not hasConsecutiveDots(unescapedDot.getParent()) and
65-
hostname =
66-
seq.getChild(i - 2).getRawValue() + seq.getChild(i - 1).getRawValue() +
67-
seq.getChild(i).getRawValue()
68-
|
69-
if unescapedDot.getParent() instanceof RegExpQuantifier
70-
then
71-
// `.*\.example.com` can match `evil.com/?x=.example.com`
72-
//
73-
// This problem only occurs when the pattern is applied against a full URL, not just a hostname/origin.
74-
// We therefore check if the pattern includes a suffix after the TLD, such as `.*\.example.com/`.
75-
// Note that a post-anchored pattern (`.*\.example.com$`) will usually fail to match a full URL,
76-
// and patterns with neither a suffix nor an anchor fall under the purview of MissingRegExpAnchor.
77-
seq.getChild(0) instanceof RegExpCaret and
78-
not seq.getAChild() instanceof RegExpDollar and
79-
seq.getChild([i .. i + 1]).(RegExpConstant).getValue().regexpMatch(".*[/?#].*") and
80-
msg =
81-
"has an unrestricted wildcard '" + unescapedDot.getParent().(RegExpQuantifier).getRawValue()
82-
+ "' which may cause '" + hostname +
83-
"' to be matched anywhere in the URL, outside the hostname."
84-
else
85-
msg =
86-
"has an unescaped '.' before '" + hostname +
87-
"', so it might match more hosts than expected."
88-
)
89-
}
90-
91-
from
92-
RegExpPatternSource re, RegExpTerm regexp, RegExpSequence hostSequence, string msg, string kind,
93-
DataFlow::Node aux
94-
where
95-
regexp = re.getRegExpTerm() and
96-
isIncompleteHostNameRegExpPattern(regexp, hostSequence, msg) and
97-
(
98-
if re.getAParse() != re
99-
then (
100-
kind = "string, which is used as a regular expression $@," and
101-
aux = re.getAParse()
102-
) else (
103-
kind = "regular expression" and aux = re
104-
)
105-
) and
106-
not CharacterEscapes::hasALikelyRegExpPatternMistake(re)
107-
select hostSequence, "This " + kind + " " + msg, aux, "here"
16+
query predicate problems = incompleteHostnameRegExp/4;

ruby/ql/lib/codeql/ruby/security/performance/RegExpTreeView.qll

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -398,6 +398,8 @@ class RegExpAlt extends RegExpTerm, TRegExpAlt {
398398
override string getAPrimaryQlClass() { result = "RegExpAlt" }
399399
}
400400

401+
class RegExpCharEscape = RegExpEscape;
402+
401403
class RegExpEscape extends RegExpNormalChar {
402404
RegExpEscape() { re.escapedCharacter(start, end) }
403405

ruby/ql/src/queries/security/cwe-020/HostnameRegexpShared.qll

Lines changed: 95 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -3,8 +3,7 @@
33
* that match URLs and hostname patterns.
44
*/
55

6-
private import codeql.ruby.security.performance.RegExpTreeView
7-
private import codeql.ruby.DataFlow
6+
private import HostnameRegexpSpecific
87

98
/**
109
* Holds if the given constant is unlikely to occur in the origin part of a URL.
@@ -20,7 +19,7 @@ predicate isConstantInvalidInsideOrigin(RegExpConstant term) {
2019

2120
/** Holds if `term` is a dot constant of form `\.` or `[.]`. */
2221
predicate isDotConstant(RegExpTerm term) {
23-
term.(RegExpEscape).getValue() = "."
22+
term.(RegExpCharEscape).getValue() = "."
2423
or
2524
exists(RegExpCharacterClass cls |
2625
term = cls and
@@ -108,3 +107,96 @@ predicate alwaysMatchesHostnameAlt(RegExpAlt alt, int i) {
108107
alwaysMatchesHostnameAlt(alt, i - 1) and
109108
alwaysMatchesHostname(alt.getChild(i))
110109
}
110+
111+
/**
112+
* Holds if `term` occurs inside a quantifier or alternative (and thus
113+
* can not be expected to correspond to a unique match), or as part of
114+
* a lookaround assertion (which are rarely used for capture groups).
115+
*/
116+
predicate isInsideChoiceOrSubPattern(RegExpTerm term) {
117+
exists(RegExpParent parent | parent = term.getParent() |
118+
parent instanceof RegExpAlt
119+
or
120+
parent instanceof RegExpQuantifier
121+
or
122+
parent instanceof RegExpSubPattern
123+
or
124+
isInsideChoiceOrSubPattern(parent)
125+
)
126+
}
127+
128+
/**
129+
* Holds if `group` is likely to be used as a capture group.
130+
*/
131+
predicate isLikelyCaptureGroup(RegExpGroup group) {
132+
group.isCapture() and
133+
not isInsideChoiceOrSubPattern(group)
134+
}
135+
136+
/**
137+
* Holds if `seq` contains two consecutive dots `..` or escaped dots.
138+
*
139+
* At least one of these dots is not intended to be a subdomain separator,
140+
* so we avoid flagging the pattern in this case.
141+
*/
142+
predicate hasConsecutiveDots(RegExpSequence seq) {
143+
exists(int i |
144+
isDotLike(seq.getChild(i)) and
145+
isDotLike(seq.getChild(i + 1))
146+
)
147+
}
148+
149+
predicate isIncompleteHostNameRegExpPattern(RegExpTerm regexp, RegExpSequence seq, string msg) {
150+
seq = regexp.getAChild*() and
151+
exists(RegExpDot unescapedDot, int i, string hostname |
152+
hasTopLevelDomainEnding(seq, i) and
153+
not isConstantInvalidInsideOrigin(seq.getChild([0 .. i - 1]).getAChild*()) and
154+
not isLikelyCaptureGroup(seq.getChild([i .. seq.getNumChild() - 1]).getAChild*()) and
155+
unescapedDot = seq.getChild([0 .. i - 1]).getAChild*() and
156+
unescapedDot != seq.getChild(i - 1) and // Should not be the '.' immediately before the TLD
157+
not hasConsecutiveDots(unescapedDot.getParent()) and
158+
hostname =
159+
seq.getChild(i - 2).getRawValue() + seq.getChild(i - 1).getRawValue() +
160+
seq.getChild(i).getRawValue()
161+
|
162+
if unescapedDot.getParent() instanceof RegExpQuantifier
163+
then
164+
// `.*\.example.com` can match `evil.com/?x=.example.com`
165+
//
166+
// This problem only occurs when the pattern is applied against a full URL, not just a hostname/origin.
167+
// We therefore check if the pattern includes a suffix after the TLD, such as `.*\.example.com/`.
168+
// Note that a post-anchored pattern (`.*\.example.com$`) will usually fail to match a full URL,
169+
// and patterns with neither a suffix nor an anchor fall under the purview of MissingRegExpAnchor.
170+
seq.getChild(0) instanceof RegExpCaret and
171+
not seq.getAChild() instanceof RegExpDollar and
172+
seq.getChild([i .. i + 1]).(RegExpConstant).getValue().regexpMatch(".*[/?#].*") and
173+
msg =
174+
"has an unrestricted wildcard '" + unescapedDot.getParent().(RegExpQuantifier).getRawValue()
175+
+ "' which may cause '" + hostname +
176+
"' to be matched anywhere in the URL, outside the hostname."
177+
else
178+
msg =
179+
"has an unescaped '.' before '" + hostname +
180+
"', so it might match more hosts than expected."
181+
)
182+
}
183+
184+
predicate incompleteHostnameRegExp(
185+
RegExpSequence hostSequence, string message, DataFlow::Node aux, string label
186+
) {
187+
exists(RegExpPatternSource re, RegExpTerm regexp, string msg, string kind |
188+
regexp = re.getRegExpTerm() and
189+
isIncompleteHostNameRegExpPattern(regexp, hostSequence, msg) and
190+
(
191+
if re.getAParse() != re
192+
then (
193+
kind = "string, which is used as a regular expression $@," and
194+
aux = re.getAParse()
195+
) else (
196+
kind = "regular expression" and aux = re
197+
)
198+
)
199+
|
200+
message = "This " + kind + " " + msg and label = "here"
201+
)
202+
}
Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,2 @@
1+
import codeql.ruby.security.performance.RegExpTreeView
2+
import codeql.ruby.DataFlow

0 commit comments

Comments
 (0)