|
12 | 12 | */
|
13 | 13 |
|
14 | 14 | private import javascript
|
15 |
| -private import semmle.javascript.security.regexp.HostnameRegexp |
16 |
| - |
17 |
| -// TODO: Share the below code. |
18 |
| -/** |
19 |
| - * Holds if `term` is an anchor that is not the first or last node |
20 |
| - * in its tree. |
21 |
| - */ |
22 |
| -predicate isInteriorAnchor(RegExpAnchor term) { |
23 |
| - not isLeftArmTerm(term) and |
24 |
| - not isRightArmTerm(term) |
25 |
| -} |
26 |
| - |
27 |
| -/** |
28 |
| - * Holds if `term` contains an anchor that is not the first or last node |
29 |
| - * in its tree, such as `(foo|bar$|baz)`. |
30 |
| - */ |
31 |
| -predicate containsInteriorAnchor(RegExpTerm term) { isInteriorAnchor(term.getAChild*()) } |
32 |
| - |
33 |
| -/** |
34 |
| - * Holds if `term` starts with a word boundary or lookbehind assertion, |
35 |
| - * indicating that it's not intended to be anchored on that side. |
36 |
| - */ |
37 |
| -predicate containsLeadingPseudoAnchor(RegExpSequence term) { |
38 |
| - exists(RegExpTerm child | child = term.getChild(0) | |
39 |
| - child instanceof RegExpWordBoundary or |
40 |
| - child instanceof RegExpNonWordBoundary or |
41 |
| - child instanceof RegExpLookbehind |
42 |
| - ) |
43 |
| -} |
44 |
| - |
45 |
| -/** |
46 |
| - * Holds if `term` ends with a word boundary or lookahead assertion, |
47 |
| - * indicating that it's not intended to be anchored on that side. |
48 |
| - */ |
49 |
| -predicate containsTrailingPseudoAnchor(RegExpSequence term) { |
50 |
| - exists(RegExpTerm child | child = term.getLastChild() | |
51 |
| - child instanceof RegExpWordBoundary or |
52 |
| - child instanceof RegExpNonWordBoundary or |
53 |
| - child instanceof RegExpLookahead |
54 |
| - ) |
55 |
| -} |
56 |
| - |
57 |
| -/** |
58 |
| - * Holds if `term` is an empty sequence, usually arising from |
59 |
| - * literals with a trailing alternative such as `foo|`. |
60 |
| - */ |
61 |
| -predicate isEmpty(RegExpSequence term) { term.getNumChild() = 0 } |
62 |
| - |
63 |
| -/** |
64 |
| - * Holds if `term` contains a letter constant. |
65 |
| - * |
66 |
| - * We use this as a heuristic to filter out uninteresting results. |
67 |
| - */ |
68 |
| -predicate containsLetters(RegExpTerm term) { |
69 |
| - term.getAChild*().(RegExpConstant).getValue().regexpMatch(".*[a-zA-Z].*") |
70 |
| -} |
71 |
| - |
72 |
| -/** |
73 |
| - * Holds if `term` consists only of an anchor and a parenthesized term, |
74 |
| - * such as the left side of `^(foo|bar)|baz`. |
75 |
| - * |
76 |
| - * The precedence of the anchor is likely to be intentional in this case, |
77 |
| - * as the group wouldn't be needed otherwise. |
78 |
| - */ |
79 |
| -predicate isAnchoredGroup(RegExpSequence term) { |
80 |
| - term.getNumChild() = 2 and |
81 |
| - term.getAChild() instanceof RegExpAnchor and |
82 |
| - term.getAChild() instanceof RegExpGroup |
83 |
| -} |
84 |
| - |
85 |
| -/** |
86 |
| - * Holds if `alt` has an explicitly anchored group, such as `^(foo|bar)|baz` |
87 |
| - * and doesn't have any unnecessary groups, such as in `^(foo)|(bar)`. |
88 |
| - */ |
89 |
| -predicate hasExplicitAnchorPrecedence(RegExpAlt alt) { |
90 |
| - isAnchoredGroup(alt.getAChild()) and |
91 |
| - not alt.getAChild() instanceof RegExpGroup |
92 |
| -} |
93 |
| - |
94 |
| -/** |
95 |
| - * Holds if `src` is a pattern for a collection of alternatives where |
96 |
| - * only the first or last alternative is anchored, indicating a |
97 |
| - * precedence mistake explained by `msg`. |
98 |
| - * |
99 |
| - * The canonical example of such a mistake is: `^a|b|c`, which is |
100 |
| - * parsed as `(^a)|(b)|(c)`. |
101 |
| - */ |
102 |
| -predicate hasMisleadingAnchorPrecedence(RegExpPatternSource src, string msg) { |
103 |
| - exists(RegExpAlt root, RegExpSequence anchoredTerm, string direction | |
104 |
| - root = src.getRegExpTerm() and |
105 |
| - not containsInteriorAnchor(root) and |
106 |
| - not isEmpty(root.getAChild()) and |
107 |
| - not hasExplicitAnchorPrecedence(root) and |
108 |
| - containsLetters(anchoredTerm) and |
109 |
| - ( |
110 |
| - anchoredTerm = root.getChild(0) and |
111 |
| - anchoredTerm.getChild(0) instanceof RegExpCaret and |
112 |
| - not containsLeadingPseudoAnchor(root.getChild([1 .. root.getNumChild() - 1])) and |
113 |
| - containsLetters(root.getChild([1 .. root.getNumChild() - 1])) and |
114 |
| - direction = "beginning" |
115 |
| - or |
116 |
| - anchoredTerm = root.getLastChild() and |
117 |
| - anchoredTerm.getLastChild() instanceof RegExpDollar and |
118 |
| - not containsTrailingPseudoAnchor(root.getChild([0 .. root.getNumChild() - 2])) and |
119 |
| - containsLetters(root.getChild([0 .. root.getNumChild() - 2])) and |
120 |
| - direction = "end" |
121 |
| - ) and |
122 |
| - // is not used for replace |
123 |
| - not exists(DataFlow::MethodCallNode replace | |
124 |
| - replace.getMethodName() = "replace" and |
125 |
| - src.getARegExpObject().flowsTo(replace.getArgument(0)) |
126 |
| - ) and |
127 |
| - msg = |
128 |
| - "Misleading operator precedence. The subexpression '" + anchoredTerm.getRawValue() + |
129 |
| - "' is anchored at the " + direction + |
130 |
| - ", but the other parts of this regular expression are not" |
131 |
| - ) |
132 |
| -} |
133 |
| - |
134 |
| -/** |
135 |
| - * Holds if `term` is a final term, that is, no term will match anything after this one. |
136 |
| - */ |
137 |
| -predicate isFinalRegExpTerm(RegExpTerm term) { |
138 |
| - term.isRootTerm() |
139 |
| - or |
140 |
| - exists(RegExpSequence seq | |
141 |
| - isFinalRegExpTerm(seq) and |
142 |
| - term = seq.getLastChild() |
143 |
| - ) |
144 |
| - or |
145 |
| - exists(RegExpTerm parent | |
146 |
| - isFinalRegExpTerm(parent) and |
147 |
| - term = parent.getAChild() and |
148 |
| - not parent instanceof RegExpSequence and |
149 |
| - not parent instanceof RegExpQuantifier |
150 |
| - ) |
151 |
| -} |
152 |
| - |
153 |
| -/** |
154 |
| - * Holds if `src` contains a hostname pattern that is missing a `$` anchor. |
155 |
| - */ |
156 |
| -predicate isSemiAnchoredHostnameRegExp(RegExpPatternSource src, string msg) { |
157 |
| - not hasMisleadingAnchorPrecedence(src, _) and // avoid double reporting |
158 |
| - exists(RegExpTerm term, RegExpSequence tld, int i | term = src.getRegExpTerm() | |
159 |
| - not isConstantInvalidInsideOrigin(term.getAChild*()) and |
160 |
| - tld = term.getAChild*() and |
161 |
| - hasTopLevelDomainEnding(tld, i) and |
162 |
| - isFinalRegExpTerm(tld.getChild(i)) and // nothing is matched after the TLD |
163 |
| - tld.getChild(0) instanceof RegExpCaret and |
164 |
| - msg = |
165 |
| - "This hostname pattern may match any domain name, as it is missing a '$' or '/' at the end." |
166 |
| - ) |
167 |
| -} |
168 |
| - |
169 |
| -/** |
170 |
| - * Holds if `src` is an unanchored pattern for a URL, indicating a |
171 |
| - * mistake explained by `msg`. |
172 |
| - */ |
173 |
| -predicate isUnanchoredHostnameRegExp(RegExpPatternSource src, string msg) { |
174 |
| - exists(RegExpTerm term, RegExpSequence tld | term = src.getRegExpTerm() | |
175 |
| - alwaysMatchesHostname(term) and |
176 |
| - tld = term.getAChild*() and |
177 |
| - hasTopLevelDomainEnding(tld) and |
178 |
| - not isConstantInvalidInsideOrigin(term.getAChild*()) and |
179 |
| - not term.getAChild*() instanceof RegExpAnchor and |
180 |
| - // that is not used for capture or replace |
181 |
| - not exists(DataFlow::MethodCallNode mcn, string name | name = mcn.getMethodName() | |
| 15 | +private import semmle.javascript.security.regexp.HostnameRegexp as HostnameRegexp |
| 16 | +private import codeql.regex.MissingRegExpAnchor as MissingRegExpAnchor |
| 17 | +private import semmle.javascript.security.regexp.RegExpTreeView::RegExpTreeView as TreeImpl |
| 18 | + |
| 19 | +private module Impl implements |
| 20 | +MissingRegExpAnchor::MissingRegExpAnchorSig<TreeImpl, HostnameRegexp::Impl> { |
| 21 | + predicate isUsedAsReplace(RegExpPatternSource pattern) { |
| 22 | + // is used for capture or replace |
| 23 | + exists(DataFlow::MethodCallNode mcn, string name | name = mcn.getMethodName() | |
182 | 24 | name = "exec" and
|
183 |
| - mcn = src.getARegExpObject().getAMethodCall() and |
| 25 | + mcn = pattern.getARegExpObject().getAMethodCall() and |
184 | 26 | exists(mcn.getAPropertyRead())
|
185 | 27 | or
|
186 | 28 | exists(DataFlow::Node arg |
|
187 | 29 | arg = mcn.getArgument(0) and
|
188 | 30 | (
|
189 |
| - src.getARegExpObject().flowsTo(arg) or |
190 |
| - src.getAParse() = arg |
| 31 | + pattern.getARegExpObject().flowsTo(arg) or |
| 32 | + pattern.getAParse() = arg |
191 | 33 | )
|
192 | 34 | |
|
193 | 35 | name = "replace"
|
194 | 36 | or
|
195 | 37 | name = "match" and exists(mcn.getAPropertyRead())
|
196 | 38 | )
|
197 |
| - ) and |
198 |
| - msg = |
199 |
| - "When this is used as a regular expression on a URL, it may match anywhere, and arbitrary hosts may come before or after it." |
200 |
| - ) |
| 39 | + ) |
| 40 | + } |
| 41 | + |
| 42 | + string getEndAnchorText() { result = "$" } |
201 | 43 | }
|
202 | 44 |
|
| 45 | +import MissingRegExpAnchor::Make<TreeImpl, HostnameRegexp::Impl, Impl> |
| 46 | + |
203 | 47 | from DataFlow::Node nd, string msg
|
204 | 48 | where
|
205 | 49 | isUnanchoredHostnameRegExp(nd, msg)
|
206 | 50 | or
|
207 | 51 | isSemiAnchoredHostnameRegExp(nd, msg)
|
208 | 52 | or
|
209 | 53 | hasMisleadingAnchorPrecedence(nd, msg)
|
| 54 | +// isLineAnchoredHostnameRegExp is not used here, as it is not relevant to JS. |
210 | 55 | select nd, msg
|
0 commit comments