Skip to content

Commit d4eb2b9

Browse files
authored
Merge pull request #11699 from erik-krogh/shareHost
Dynamic: Share more regexp code
2 parents 31f7702 + 6c8b1cf commit d4eb2b9

28 files changed

+754
-1045
lines changed

config/identical-files.json

Lines changed: 0 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -531,11 +531,6 @@
531531
"ruby/ql/lib/codeql/ruby/internal/ConceptsShared.qll",
532532
"javascript/ql/lib/semmle/javascript/internal/ConceptsShared.qll"
533533
],
534-
"Hostname Regexp queries": [
535-
"javascript/ql/src/Security/CWE-020/HostnameRegexpShared.qll",
536-
"python/ql/src/Security/CWE-020/HostnameRegexpShared.qll",
537-
"ruby/ql/src/queries/security/cwe-020/HostnameRegexpShared.qll"
538-
],
539534
"ApiGraphModels": [
540535
"javascript/ql/lib/semmle/javascript/frameworks/data/internal/ApiGraphModels.qll",
541536
"ruby/ql/lib/codeql/ruby/frameworks/data/internal/ApiGraphModels.qll",

java/ql/lib/semmle/code/java/regex/RegexTreeView.qll

Lines changed: 41 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -85,6 +85,9 @@ module Impl implements RegexTreeViewSig {
8585

8686
/** Gets the associated regex. */
8787
abstract Regex getRegex();
88+
89+
/** Gets the last child term of this element. */
90+
RegExpTerm getLastChild() { result = this.getChild(this.getNumChild() - 1) }
8891
}
8992

9093
/**
@@ -558,13 +561,31 @@ module Impl implements RegexTreeViewSig {
558561
}
559562
}
560563

564+
/**
565+
* A character escape in a regular expression.
566+
*
567+
* Example:
568+
*
569+
* ```
570+
* \.
571+
* ```
572+
*/
573+
class RegExpCharEscape = RegExpEscape;
574+
561575
/**
562576
* A word boundary, that is, a regular expression term of the form `\b`.
563577
*/
564578
class RegExpWordBoundary extends RegExpSpecialChar {
565579
RegExpWordBoundary() { this.getChar() = "\\b" }
566580
}
567581

582+
/**
583+
* A non-word boundary, that is, a regular expression term of the form `\B`.
584+
*/
585+
class RegExpNonWordBoundary extends RegExpSpecialChar {
586+
RegExpNonWordBoundary() { this.getChar() = "\\B" }
587+
}
588+
568589
/**
569590
* Gets the hex number for the `hex` char.
570591
*/
@@ -868,6 +889,9 @@ module Impl implements RegexTreeViewSig {
868889
predicate isNamedGroupOfLiteral(RegExpLiteral lit, string name) {
869890
lit = this.getLiteral() and name = this.getName()
870891
}
892+
893+
/** Holds if this is a capture group. */
894+
predicate isCapture() { exists(this.getNumber()) }
871895
}
872896

873897
/**
@@ -917,6 +941,21 @@ module Impl implements RegexTreeViewSig {
917941
override string getPrimaryQLClass() { result = "RegExpDot" }
918942
}
919943

944+
/**
945+
* A term that matches a specific position between characters in the string.
946+
*
947+
* Example:
948+
*
949+
* ```
950+
* ^
951+
* ```
952+
*/
953+
class RegExpAnchor extends RegExpSpecialChar {
954+
RegExpAnchor() { this.getChar() = ["$", "^"] }
955+
956+
override string getPrimaryQLClass() { result = "RegExpAnchor" }
957+
}
958+
920959
/**
921960
* A dollar assertion `$` matching the end of a line.
922961
*
@@ -926,7 +965,7 @@ module Impl implements RegexTreeViewSig {
926965
* $
927966
* ```
928967
*/
929-
class RegExpDollar extends RegExpSpecialChar {
968+
class RegExpDollar extends RegExpAnchor {
930969
RegExpDollar() { this.getChar() = "$" }
931970

932971
override string getPrimaryQLClass() { result = "RegExpDollar" }
@@ -941,7 +980,7 @@ module Impl implements RegexTreeViewSig {
941980
* ^
942981
* ```
943982
*/
944-
class RegExpCaret extends RegExpSpecialChar {
983+
class RegExpCaret extends RegExpAnchor {
945984
RegExpCaret() { this.getChar() = "^" }
946985

947986
override string getPrimaryQLClass() { result = "RegExpCaret" }

javascript/ql/lib/semmle/javascript/Regexp.qll

Lines changed: 10 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -366,6 +366,9 @@ class RegExpAnchor extends RegExpTerm, @regexp_anchor {
366366
override predicate isNullable() { any() }
367367

368368
override string getAPrimaryQlClass() { result = "RegExpAnchor" }
369+
370+
/** Gets the char for this term. */
371+
abstract string getChar();
369372
}
370373

371374
/**
@@ -379,6 +382,8 @@ class RegExpAnchor extends RegExpTerm, @regexp_anchor {
379382
*/
380383
class RegExpCaret extends RegExpAnchor, @regexp_caret {
381384
override string getAPrimaryQlClass() { result = "RegExpCaret" }
385+
386+
override string getChar() { result = "^" }
382387
}
383388

384389
/**
@@ -392,6 +397,8 @@ class RegExpCaret extends RegExpAnchor, @regexp_caret {
392397
*/
393398
class RegExpDollar extends RegExpAnchor, @regexp_dollar {
394399
override string getAPrimaryQlClass() { result = "RegExpDollar" }
400+
401+
override string getChar() { result = "$" }
395402
}
396403

397404
/**
@@ -999,11 +1006,12 @@ predicate isInterpretedAsRegExp(DataFlow::Node source) {
9991006
/**
10001007
* Provides utility predicates related to regular expressions.
10011008
*/
1002-
module RegExpPatterns {
1009+
deprecated module RegExpPatterns {
10031010
/**
10041011
* Gets a pattern that matches common top-level domain names in lower case.
1012+
* DEPRECATED: use the similarly named predicate from `HostnameRegex` from the `regex` pack instead.
10051013
*/
1006-
string getACommonTld() {
1014+
deprecated string getACommonTld() {
10071015
// according to ranking by http://google.com/search?q=site:.<<TLD>>
10081016
result = "(?:com|org|edu|gov|uk|net|io)(?![a-z0-9])"
10091017
}
Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,18 @@
1+
/**
2+
* Provides predicates for reasoning about regular expressions
3+
* that match URLs and hostname patterns.
4+
*/
5+
6+
private import javascript as JS
7+
private import semmle.javascript.security.regexp.RegExpTreeView::RegExpTreeView as TreeImpl
8+
private import semmle.javascript.Regexp as RegExp
9+
private import codeql.regex.HostnameRegexp as Shared
10+
11+
/** An implementation of the signature that allows the Hostname analysis to run. */
12+
module Impl implements Shared::HostnameRegexpSig<TreeImpl> {
13+
class DataFlowNode = JS::DataFlow::Node;
14+
15+
class RegExpPatternSource = RegExp::RegExpPatternSource;
16+
}
17+
18+
import Shared::Make<TreeImpl, Impl>

javascript/ql/src/Security/CWE-020/HostnameRegexpShared.qll

Lines changed: 2 additions & 197 deletions
Original file line numberDiff line numberDiff line change
@@ -3,200 +3,5 @@
33
* that match URLs and hostname patterns.
44
*/
55

6-
private import HostnameRegexpSpecific
7-
8-
/**
9-
* Holds if the given constant is unlikely to occur in the origin part of a URL.
10-
*/
11-
predicate isConstantInvalidInsideOrigin(RegExpConstant term) {
12-
// Look for any of these cases:
13-
// - A character that can't occur in the origin
14-
// - Two dashes in a row
15-
// - A colon that is not part of port or scheme separator
16-
// - A slash that is not part of scheme separator
17-
term.getValue().regexpMatch(".*(?:[^a-zA-Z0-9.:/-]|--|:[^0-9/]|(?<![/:]|^)/).*")
18-
}
19-
20-
/** Holds if `term` is a dot constant of form `\.` or `[.]`. */
21-
predicate isDotConstant(RegExpTerm term) {
22-
term.(RegExpCharEscape).getValue() = "."
23-
or
24-
exists(RegExpCharacterClass cls |
25-
term = cls and
26-
not cls.isInverted() and
27-
cls.getNumChild() = 1 and
28-
cls.getAChild().(RegExpConstant).getValue() = "."
29-
)
30-
}
31-
32-
/** Holds if `term` is a wildcard `.` or an actual `.` character. */
33-
predicate isDotLike(RegExpTerm term) {
34-
term instanceof RegExpDot
35-
or
36-
isDotConstant(term)
37-
}
38-
39-
/** Holds if `term` will only ever be matched against the beginning of the input. */
40-
predicate matchesBeginningOfString(RegExpTerm term) {
41-
term.isRootTerm()
42-
or
43-
exists(RegExpTerm parent | matchesBeginningOfString(parent) |
44-
term = parent.(RegExpSequence).getChild(0)
45-
or
46-
parent.(RegExpSequence).getChild(0) instanceof RegExpCaret and
47-
term = parent.(RegExpSequence).getChild(1)
48-
or
49-
term = parent.(RegExpAlt).getAChild()
50-
or
51-
term = parent.(RegExpGroup).getAChild()
52-
)
53-
}
54-
55-
/**
56-
* Holds if the given sequence `seq` contains top-level domain preceded by a dot, such as `.com`,
57-
* excluding cases where this is at the very beginning of the regexp.
58-
*
59-
* `i` is bound to the index of the last child in the top-level domain part.
60-
*/
61-
predicate hasTopLevelDomainEnding(RegExpSequence seq, int i) {
62-
seq.getChild(i)
63-
.(RegExpConstant)
64-
.getValue()
65-
.regexpMatch("(?i)" + RegExpPatterns::getACommonTld() + "(:\\d+)?([/?#].*)?") and
66-
isDotLike(seq.getChild(i - 1)) and
67-
not (i = 1 and matchesBeginningOfString(seq))
68-
}
69-
70-
/**
71-
* Holds if the given regular expression term contains top-level domain preceded by a dot,
72-
* such as `.com`.
73-
*/
74-
predicate hasTopLevelDomainEnding(RegExpSequence seq) { hasTopLevelDomainEnding(seq, _) }
75-
76-
/**
77-
* Holds if `term` will always match a hostname, that is, all disjunctions contain
78-
* a hostname pattern that isn't inside a quantifier.
79-
*/
80-
predicate alwaysMatchesHostname(RegExpTerm term) {
81-
hasTopLevelDomainEnding(term, _)
82-
or
83-
// `localhost` is considered a hostname pattern, but has no TLD
84-
term.(RegExpConstant).getValue().regexpMatch("\\blocalhost\\b")
85-
or
86-
not term instanceof RegExpAlt and
87-
not term instanceof RegExpQuantifier and
88-
alwaysMatchesHostname(term.getAChild())
89-
or
90-
alwaysMatchesHostnameAlt(term)
91-
}
92-
93-
/** Holds if every child of `alt` contains a hostname pattern. */
94-
predicate alwaysMatchesHostnameAlt(RegExpAlt alt) {
95-
alwaysMatchesHostnameAlt(alt, alt.getNumChild() - 1)
96-
}
97-
98-
/**
99-
* Holds if the first `i` children of `alt` contains a hostname pattern.
100-
*
101-
* This is used instead of `forall` to avoid materializing the set of alternatives
102-
* that don't contains hostnames, which is much larger.
103-
*/
104-
predicate alwaysMatchesHostnameAlt(RegExpAlt alt, int i) {
105-
alwaysMatchesHostname(alt.getChild(0)) and i = 0
106-
or
107-
alwaysMatchesHostnameAlt(alt, i - 1) and
108-
alwaysMatchesHostname(alt.getChild(i))
109-
}
110-
111-
/**
112-
* Holds if `term` occurs inside a quantifier or alternative (and thus
113-
* can not be expected to correspond to a unique match), or as part of
114-
* a lookaround assertion (which are rarely used for capture groups).
115-
*/
116-
predicate isInsideChoiceOrSubPattern(RegExpTerm term) {
117-
exists(RegExpParent parent | parent = term.getParent() |
118-
parent instanceof RegExpAlt
119-
or
120-
parent instanceof RegExpQuantifier
121-
or
122-
parent instanceof RegExpSubPattern
123-
or
124-
isInsideChoiceOrSubPattern(parent)
125-
)
126-
}
127-
128-
/**
129-
* Holds if `group` is likely to be used as a capture group.
130-
*/
131-
predicate isLikelyCaptureGroup(RegExpGroup group) {
132-
group.isCapture() and
133-
not isInsideChoiceOrSubPattern(group)
134-
}
135-
136-
/**
137-
* Holds if `seq` contains two consecutive dots `..` or escaped dots.
138-
*
139-
* At least one of these dots is not intended to be a subdomain separator,
140-
* so we avoid flagging the pattern in this case.
141-
*/
142-
predicate hasConsecutiveDots(RegExpSequence seq) {
143-
exists(int i |
144-
isDotLike(seq.getChild(i)) and
145-
isDotLike(seq.getChild(i + 1))
146-
)
147-
}
148-
149-
predicate isIncompleteHostNameRegExpPattern(RegExpTerm regexp, RegExpSequence seq, string msg) {
150-
seq = regexp.getAChild*() and
151-
exists(RegExpDot unescapedDot, int i, string hostname |
152-
hasTopLevelDomainEnding(seq, i) and
153-
not isConstantInvalidInsideOrigin(seq.getChild([0 .. i - 1]).getAChild*()) and
154-
not isLikelyCaptureGroup(seq.getChild([i .. seq.getNumChild() - 1]).getAChild*()) and
155-
unescapedDot = seq.getChild([0 .. i - 1]).getAChild*() and
156-
unescapedDot != seq.getChild(i - 1) and // Should not be the '.' immediately before the TLD
157-
not hasConsecutiveDots(unescapedDot.getParent()) and
158-
hostname =
159-
seq.getChild(i - 2).getRawValue() + seq.getChild(i - 1).getRawValue() +
160-
seq.getChild(i).getRawValue()
161-
|
162-
if unescapedDot.getParent() instanceof RegExpQuantifier
163-
then
164-
// `.*\.example.com` can match `evil.com/?x=.example.com`
165-
//
166-
// This problem only occurs when the pattern is applied against a full URL, not just a hostname/origin.
167-
// We therefore check if the pattern includes a suffix after the TLD, such as `.*\.example.com/`.
168-
// Note that a post-anchored pattern (`.*\.example.com$`) will usually fail to match a full URL,
169-
// and patterns with neither a suffix nor an anchor fall under the purview of MissingRegExpAnchor.
170-
seq.getChild(0) instanceof RegExpCaret and
171-
not seq.getAChild() instanceof RegExpDollar and
172-
seq.getChild([i .. i + 1]).(RegExpConstant).getValue().regexpMatch(".*[/?#].*") and
173-
msg =
174-
"has an unrestricted wildcard '" + unescapedDot.getParent().(RegExpQuantifier).getRawValue()
175-
+ "' which may cause '" + hostname +
176-
"' to be matched anywhere in the URL, outside the hostname."
177-
else
178-
msg =
179-
"has an unescaped '.' before '" + hostname +
180-
"', so it might match more hosts than expected."
181-
)
182-
}
183-
184-
predicate incompleteHostnameRegExp(
185-
RegExpSequence hostSequence, string message, DataFlow::Node aux, string label
186-
) {
187-
exists(RegExpPatternSource re, RegExpTerm regexp, string msg, string kind |
188-
regexp = re.getRegExpTerm() and
189-
isIncompleteHostNameRegExpPattern(regexp, hostSequence, msg) and
190-
(
191-
if re.getAParse() != re
192-
then (
193-
kind = "string, which is used as a regular expression $@," and
194-
aux = re.getAParse()
195-
) else (
196-
kind = "regular expression" and aux = re
197-
)
198-
)
199-
|
200-
message = "This " + kind + " " + msg and label = "here"
201-
)
202-
}
6+
deprecated import semmle.javascript.security.regexp.HostnameRegexp as Dep
7+
import Dep

javascript/ql/src/Security/CWE-020/HostnameRegexpSpecific.qll

Lines changed: 0 additions & 1 deletion
This file was deleted.

javascript/ql/src/Security/CWE-020/IncompleteHostnameRegExp.ql

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,6 @@
1111
* external/cwe/cwe-020
1212
*/
1313

14-
import HostnameRegexpShared
14+
private import semmle.javascript.security.regexp.HostnameRegexp as HostnameRegexp
1515

16-
query predicate problems = incompleteHostnameRegExp/4;
16+
query predicate problems = HostnameRegexp::incompleteHostnameRegExp/4;

javascript/ql/src/Security/CWE-020/IncompleteUrlSubstringSanitization.qll

Lines changed: 2 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@
33
*/
44

55
private import IncompleteUrlSubstringSanitizationSpecific
6+
private import codeql.regex.HostnameRegexp::Utils
67

78
/**
89
* A check on a string for whether it contains a given substring, possibly with restrictions on the location of the substring.
@@ -30,9 +31,7 @@ query predicate problems(
3031
mayHaveStringValue(substring, target) and
3132
(
3233
// target contains a domain on a common TLD, and perhaps some other URL components
33-
target
34-
.regexpMatch("(?i)([a-z]*:?//)?\\.?([a-z0-9-]+\\.)+" + RegExpPatterns::getACommonTld() +
35-
"(:[0-9]+)?/?")
34+
target.regexpMatch("(?i)([a-z]*:?//)?\\.?([a-z0-9-]+\\.)+" + getACommonTld() + "(:[0-9]+)?/?")
3635
or
3736
// target is a HTTP URL to a domain on any TLD
3837
target.regexpMatch("(?i)https?://([a-z0-9-]+\\.)+([a-z]+)(:[0-9]+)?/?")

0 commit comments

Comments
 (0)