|
3 | 3 | * that match URLs and hostname patterns.
|
4 | 4 | */
|
5 | 5 |
|
6 |
| -private import HostnameRegexpSpecific |
7 |
| - |
8 |
/**
 * Holds if the value of the constant `term` contains text that is unlikely
 * to appear in the origin part of a URL.
 */
predicate isConstantInvalidInsideOrigin(RegExpConstant term) {
  // The value is flagged when it contains any of the following:
  //  - a character other than a letter, digit, `.`, `:`, `/` or `-`
  //  - two dashes in a row
  //  - a colon followed by something that is neither a digit (port) nor a slash (scheme separator)
  //  - a slash that is not at the start and is not preceded by `/` or `:`
  exists(string value | value = term.getValue() |
    value.regexpMatch(".*(?:[^a-zA-Z0-9.:/-]|--|:[^0-9/]|(?<![/:]|^)/).*")
  )
}
19 |
| - |
20 |
/** Holds if `term` is a literal dot, written either as `\.` or as `[.]`. */
predicate isDotConstant(RegExpTerm term) {
  // `[.]`: a non-inverted character class whose only member is the constant `.`
  exists(RegExpCharacterClass dotClass | dotClass = term |
    dotClass.getNumChild() = 1 and
    dotClass.getAChild().(RegExpConstant).getValue() = "." and
    not dotClass.isInverted()
  )
  or
  // `\.`: an escape sequence whose value is `.`
  term.(RegExpCharEscape).getValue() = "."
}
31 |
| - |
32 |
/** Holds if `term` is either the wildcard `.` or a literal `.` character (`\.` or `[.]`). */
predicate isDotLike(RegExpTerm term) {
  isDotConstant(term)
  or
  term instanceof RegExpDot
}
38 |
| - |
39 |
/** Holds if `term` can only ever be matched against the very start of the input. */
predicate matchesBeginningOfString(RegExpTerm term) {
  // The root term trivially starts at the beginning.
  term.isRootTerm()
  or
  exists(RegExpTerm parent | matchesBeginningOfString(parent) |
    // Every branch of an alternation, and the contents of a group, start where the parent starts.
    term = parent.(RegExpAlt).getAChild()
    or
    term = parent.(RegExpGroup).getAChild()
    or
    exists(RegExpSequence parentSeq | parentSeq = parent |
      // The first element of a sequence starts where the sequence starts ...
      term = parentSeq.getChild(0)
      or
      // ... and so does the second element when the first one is just a `^` anchor.
      parentSeq.getChild(0) instanceof RegExpCaret and
      term = parentSeq.getChild(1)
    )
  )
}
54 |
| - |
55 |
/**
 * Holds if the sequence `seq` contains a top-level domain preceded by a dot, such as `.com`,
 * except when that dot/TLD pair sits at the very beginning of the regexp.
 *
 * `i` is bound to the index of the child of `seq` holding the top-level domain; that constant
 * may additionally carry a `:port` and a suffix starting with `/`, `?` or `#`.
 */
predicate hasTopLevelDomainEnding(RegExpSequence seq, int i) {
  // Child `i` is a constant made up of a common TLD (case-insensitive), plus optional port and suffix.
  exists(string tldText | tldText = seq.getChild(i).(RegExpConstant).getValue() |
    tldText.regexpMatch("(?i)" + RegExpPatterns::getACommonTld() + "(:\\d+)?([/?#].*)?")
  ) and
  // The child right before it is a wildcard or literal dot.
  isDotLike(seq.getChild(i - 1)) and
  // Exclude a pattern that begins with the dot and TLD.
  not (matchesBeginningOfString(seq) and i = 1)
}
69 |
| - |
70 |
/**
 * Holds if the sequence `seq` contains a top-level domain preceded by a dot,
 * such as `.com`, at some child index.
 */
predicate hasTopLevelDomainEnding(RegExpSequence seq) {
  exists(int tldIndex | hasTopLevelDomainEnding(seq, tldIndex))
}
75 |
| - |
76 |
/**
 * Holds if `term` will always match a hostname, that is, all disjunctions contain
 * a hostname pattern that isn't inside a quantifier.
 */
predicate alwaysMatchesHostname(RegExpTerm term) {
  // A sequence with a dot followed by a top-level domain.
  hasTopLevelDomainEnding(term, _)
  or
  // `localhost` is considered a hostname pattern, but has no TLD.
  // `regexpMatch` has to match the entire value, so the pattern is wrapped in `.*` to let the
  // `\b` word boundaries find `localhost` inside a longer constant such as `localhost:8080`.
  term.(RegExpConstant).getValue().regexpMatch(".*\\blocalhost\\b.*")
  or
  // Any term other than an alternation or a quantifier matches a hostname if one of its children does.
  not term instanceof RegExpAlt and
  not term instanceof RegExpQuantifier and
  alwaysMatchesHostname(term.getAChild())
  or
  // An alternation matches a hostname only if every branch does.
  alwaysMatchesHostnameAlt(term)
}
92 |
| - |
93 |
/** Holds if each alternative of `alt` contains a hostname pattern. */
predicate alwaysMatchesHostnameAlt(RegExpAlt alt) {
  // Checking the prefix that ends at the last child covers every child.
  exists(int lastIndex | lastIndex = alt.getNumChild() - 1 |
    alwaysMatchesHostnameAlt(alt, lastIndex)
  )
}
97 |
| - |
98 |
/**
 * Holds if each of the children of `alt` at indices `0` through `i` (inclusive)
 * contains a hostname pattern.
 *
 * This is used instead of `forall` to avoid materializing the set of alternatives
 * that don't contain hostnames, which is much larger.
 */
predicate alwaysMatchesHostnameAlt(RegExpAlt alt, int i) {
  alwaysMatchesHostname(alt.getChild(i)) and
  (
    // Base case: only the first alternative needs checking.
    i = 0
    or
    // Inductive case: all earlier alternatives have already been shown to match.
    alwaysMatchesHostnameAlt(alt, i - 1)
  )
}
110 |
| - |
111 |
/**
 * Holds if some ancestor of `term` is an alternation or a quantifier (so `term`
 * can not be expected to correspond to a unique match), or a lookaround
 * assertion (which is rarely used for capture groups).
 */
predicate isInsideChoiceOrSubPattern(RegExpTerm term) {
  exists(RegExpParent enclosing | enclosing = term.getParent() |
    // The direct parent is one of the constructs of interest ...
    enclosing instanceof RegExpQuantifier
    or
    enclosing instanceof RegExpAlt
    or
    enclosing instanceof RegExpSubPattern
    or
    // ... or the parent is itself nested inside one.
    isInsideChoiceOrSubPattern(enclosing)
  )
}
127 |
| - |
128 |
/**
 * Holds if `group` is a capturing group that is not nested inside an alternation,
 * quantifier or lookaround, and is therefore likely to be used as a capture group.
 */
predicate isLikelyCaptureGroup(RegExpGroup group) {
  not isInsideChoiceOrSubPattern(group) and
  group.isCapture()
}
135 |
| - |
136 |
/**
 * Holds if `seq` has two adjacent children that are both dots, either
 * wildcards (`..`) or escaped dots.
 *
 * At least one of the two dots is then not meant as a subdomain separator,
 * so such patterns are not flagged.
 */
predicate hasConsecutiveDots(RegExpSequence seq) {
  exists(int first, int second | second = first + 1 |
    isDotLike(seq.getChild(first)) and
    isDotLike(seq.getChild(second))
  )
}
148 |
| - |
149 |
/**
 * Holds if `seq` is `regexp` itself or a sequence nested inside it, and `seq` contains
 * a wildcard `.` in front of a top-level domain ending, making the hostname pattern
 * more permissive than it looks.
 *
 * `msg` is bound to the tail of an alert message describing the problem.
 */
predicate isIncompleteHostNameRegExpPattern(RegExpTerm regexp, RegExpSequence seq, string msg) {
  seq = regexp.getAChild*() and
  exists(RegExpDot wildcard, int tldIndex, string host |
    hasTopLevelDomainEnding(seq, tldIndex) and
    // The wildcard lies somewhere before the TLD ...
    wildcard = seq.getChild([0 .. tldIndex - 1]).getAChild*() and
    // ... but is not the '.' immediately before the TLD ...
    wildcard != seq.getChild(tldIndex - 1) and
    // ... and is not part of a `..` run.
    not hasConsecutiveDots(wildcard.getParent()) and
    // Everything before the TLD must look like it belongs to an origin.
    not isConstantInvalidInsideOrigin(seq.getChild([0 .. tldIndex - 1]).getAChild*()) and
    // Nothing from the TLD onwards may be a likely capture group.
    not isLikelyCaptureGroup(seq.getChild([tldIndex .. seq.getNumChild() - 1]).getAChild*()) and
    // The reported hostname is the raw text of the TLD child and the two children before it.
    host =
      seq.getChild(tldIndex - 2).getRawValue() + seq.getChild(tldIndex - 1).getRawValue() +
        seq.getChild(tldIndex).getRawValue()
  |
    if wildcard.getParent() instanceof RegExpQuantifier
    then
      // `.*\.example.com` can match `evil.com/?x=.example.com`
      //
      // That only matters when the pattern is applied to a full URL rather than just a
      // hostname/origin, so require a suffix after the TLD, as in `.*\.example.com/`.
      // A post-anchored pattern (`.*\.example.com$`) will usually fail to match a full URL,
      // and patterns with neither a suffix nor an anchor are left to MissingRegExpAnchor.
      seq.getChild(0) instanceof RegExpCaret and
      not seq.getAChild() instanceof RegExpDollar and
      seq.getChild([tldIndex .. tldIndex + 1]).(RegExpConstant).getValue().regexpMatch(".*[/?#].*") and
      msg =
        "has an unrestricted wildcard '" + wildcard.getParent().(RegExpQuantifier).getRawValue() +
          "' which may cause '" + host +
          "' to be matched anywhere in the URL, outside the hostname."
    else
      msg =
        "has an unescaped '.' before '" + host + "', so it might match more hosts than expected."
  )
}
183 |
| - |
184 |
/**
 * Holds if `hostSequence` is an incomplete hostname pattern inside the regular expression
 * term of some `RegExpPatternSource`.
 *
 * `message` is the full alert text. `aux` is a parse of the source when one differs from the
 * source itself (the message then describes a string used as a regular expression), and the
 * source otherwise. `label` is always `"here"`.
 */
predicate incompleteHostnameRegExp(
  RegExpSequence hostSequence, string message, DataFlow::Node aux, string label
) {
  exists(RegExpPatternSource source, string problem, string kind |
    isIncompleteHostNameRegExpPattern(source.getRegExpTerm(), hostSequence, problem) and
    (
      if source.getAParse() != source
      then (
        aux = source.getAParse() and
        kind = "string, which is used as a regular expression $@,"
      ) else (
        aux = source and kind = "regular expression"
      )
    ) and
    label = "here" and
    message = "This " + kind + " " + problem
  )
}
| 6 | +deprecated import semmle.javascript.security.regexp.HostnameRegexp as Dep |
| 7 | +import Dep |
0 commit comments