Skip to content

Commit 26c5480

Browse files
committed
share {js,rb}/regex/missing-regexp-anchor
1 parent 355499e commit 26c5480

File tree

10 files changed

+347
-385
lines changed

10 files changed

+347
-385
lines changed

java/ql/lib/semmle/code/java/regex/RegexTreeView.qll

Lines changed: 27 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -85,6 +85,9 @@ module Impl implements RegexTreeViewSig {
8585

8686
/** Gets the associated regex. */
8787
abstract Regex getRegex();
88+
89+
/** Gets the last child term of this element. */
90+
RegExpTerm getLastChild() { result = this.getChild(this.getNumChild() - 1) }
8891
}
8992

9093
/**
@@ -567,6 +570,13 @@ module Impl implements RegexTreeViewSig {
567570
RegExpWordBoundary() { this.getChar() = "\\b" }
568571
}
569572

573+
/**
574+
* A non-word boundary, that is, a regular expression term of the form `\B`.
575+
*/
576+
class RegExpNonWordBoundary extends RegExpSpecialChar {
577+
RegExpNonWordBoundary() { this.getChar() = "\\B" }
578+
}
579+
570580
/**
571581
* Gets the hex number for the `hex` char.
572582
*/
@@ -922,6 +932,21 @@ module Impl implements RegexTreeViewSig {
922932
override string getPrimaryQLClass() { result = "RegExpDot" }
923933
}
924934

935+
/**
936+
* A term that matches a specific position between characters in the string.
937+
*
938+
* Example:
939+
*
940+
* ```
941+
* ^
942+
* ```
943+
*/
944+
class RegExpAnchor extends RegExpSpecialChar {
945+
RegExpAnchor() { this.getChar() = ["$", "^"] }
946+
947+
override string getPrimaryQLClass() { result = "RegExpAnchor" }
948+
}
949+
925950
/**
926951
* A dollar assertion `$` matching the end of a line.
927952
*
@@ -931,7 +956,7 @@ module Impl implements RegexTreeViewSig {
931956
* $
932957
* ```
933958
*/
934-
class RegExpDollar extends RegExpSpecialChar {
959+
class RegExpDollar extends RegExpAnchor {
935960
RegExpDollar() { this.getChar() = "$" }
936961

937962
override string getPrimaryQLClass() { result = "RegExpDollar" }
@@ -946,7 +971,7 @@ module Impl implements RegexTreeViewSig {
946971
* ^
947972
* ```
948973
*/
949-
class RegExpCaret extends RegExpSpecialChar {
974+
class RegExpCaret extends RegExpAnchor {
950975
RegExpCaret() { this.getChar() = "^" }
951976

952977
override string getPrimaryQLClass() { result = "RegExpCaret" }

javascript/ql/lib/semmle/javascript/Regexp.qll

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -366,6 +366,9 @@ class RegExpAnchor extends RegExpTerm, @regexp_anchor {
366366
override predicate isNullable() { any() }
367367

368368
override string getAPrimaryQlClass() { result = "RegExpAnchor" }
369+
370+
/** Gets the char for this term. */
371+
abstract string getChar();
369372
}
370373

371374
/**
@@ -379,6 +382,8 @@ class RegExpAnchor extends RegExpTerm, @regexp_anchor {
379382
*/
380383
class RegExpCaret extends RegExpAnchor, @regexp_caret {
381384
override string getAPrimaryQlClass() { result = "RegExpCaret" }
385+
386+
override string getChar() { result = "^" }
382387
}
383388

384389
/**
@@ -392,6 +397,8 @@ class RegExpCaret extends RegExpAnchor, @regexp_caret {
392397
*/
393398
class RegExpDollar extends RegExpAnchor, @regexp_dollar {
394399
override string getAPrimaryQlClass() { result = "RegExpDollar" }
400+
401+
override string getChar() { result = "$" }
395402
}
396403

397404
/**

javascript/ql/lib/semmle/javascript/security/regexp/HostnameRegexp.qll

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,8 @@ private import semmle.javascript.security.regexp.RegExpTreeView::RegExpTreeView
88
private import semmle.javascript.Regexp as RegExp
99
private import codeql.regex.HostnameRegexp as Shared
1010

11-
private module Impl implements Shared::HostnameRegexpSig<TreeImpl> {
11+
/** An implementation of the signature that allows the Hostname analysis to run. */
12+
module Impl implements Shared::HostnameRegexpSig<TreeImpl> {
1213
class DataFlowNode = JS::DataFlow::Node;
1314

1415
class RegExpPatternSource = RegExp::RegExpPatternSource;

javascript/ql/src/Security/CWE-020/MissingRegExpAnchor.ql

Lines changed: 19 additions & 174 deletions
Original file line numberDiff line numberDiff line change
@@ -12,199 +12,44 @@
1212
*/
1313

1414
private import javascript
15-
private import semmle.javascript.security.regexp.HostnameRegexp
16-
17-
// TODO: Share the below code.
18-
/**
19-
* Holds if `term` is an anchor that is not the first or last node
20-
* in its tree.
21-
*/
22-
predicate isInteriorAnchor(RegExpAnchor term) {
23-
not isLeftArmTerm(term) and
24-
not isRightArmTerm(term)
25-
}
26-
27-
/**
28-
* Holds if `term` contains an anchor that is not the first or last node
29-
* in its tree, such as `(foo|bar$|baz)`.
30-
*/
31-
predicate containsInteriorAnchor(RegExpTerm term) { isInteriorAnchor(term.getAChild*()) }
32-
33-
/**
34-
* Holds if `term` starts with a word boundary or lookbehind assertion,
35-
* indicating that it's not intended to be anchored on that side.
36-
*/
37-
predicate containsLeadingPseudoAnchor(RegExpSequence term) {
38-
exists(RegExpTerm child | child = term.getChild(0) |
39-
child instanceof RegExpWordBoundary or
40-
child instanceof RegExpNonWordBoundary or
41-
child instanceof RegExpLookbehind
42-
)
43-
}
44-
45-
/**
46-
* Holds if `term` ends with a word boundary or lookahead assertion,
47-
* indicating that it's not intended to be anchored on that side.
48-
*/
49-
predicate containsTrailingPseudoAnchor(RegExpSequence term) {
50-
exists(RegExpTerm child | child = term.getLastChild() |
51-
child instanceof RegExpWordBoundary or
52-
child instanceof RegExpNonWordBoundary or
53-
child instanceof RegExpLookahead
54-
)
55-
}
56-
57-
/**
58-
* Holds if `term` is an empty sequence, usually arising from
59-
* literals with a trailing alternative such as `foo|`.
60-
*/
61-
predicate isEmpty(RegExpSequence term) { term.getNumChild() = 0 }
62-
63-
/**
64-
* Holds if `term` contains a letter constant.
65-
*
66-
* We use this as a heuristic to filter out uninteresting results.
67-
*/
68-
predicate containsLetters(RegExpTerm term) {
69-
term.getAChild*().(RegExpConstant).getValue().regexpMatch(".*[a-zA-Z].*")
70-
}
71-
72-
/**
73-
* Holds if `term` consists only of an anchor and a parenthesized term,
74-
* such as the left side of `^(foo|bar)|baz`.
75-
*
76-
* The precedence of the anchor is likely to be intentional in this case,
77-
* as the group wouldn't be needed otherwise.
78-
*/
79-
predicate isAnchoredGroup(RegExpSequence term) {
80-
term.getNumChild() = 2 and
81-
term.getAChild() instanceof RegExpAnchor and
82-
term.getAChild() instanceof RegExpGroup
83-
}
84-
85-
/**
86-
* Holds if `alt` has an explicitly anchored group, such as `^(foo|bar)|baz`
87-
* and doesn't have any unnecessary groups, such as in `^(foo)|(bar)`.
88-
*/
89-
predicate hasExplicitAnchorPrecedence(RegExpAlt alt) {
90-
isAnchoredGroup(alt.getAChild()) and
91-
not alt.getAChild() instanceof RegExpGroup
92-
}
93-
94-
/**
95-
* Holds if `src` is a pattern for a collection of alternatives where
96-
* only the first or last alternative is anchored, indicating a
97-
* precedence mistake explained by `msg`.
98-
*
99-
* The canonical example of such a mistake is: `^a|b|c`, which is
100-
* parsed as `(^a)|(b)|(c)`.
101-
*/
102-
predicate hasMisleadingAnchorPrecedence(RegExpPatternSource src, string msg) {
103-
exists(RegExpAlt root, RegExpSequence anchoredTerm, string direction |
104-
root = src.getRegExpTerm() and
105-
not containsInteriorAnchor(root) and
106-
not isEmpty(root.getAChild()) and
107-
not hasExplicitAnchorPrecedence(root) and
108-
containsLetters(anchoredTerm) and
109-
(
110-
anchoredTerm = root.getChild(0) and
111-
anchoredTerm.getChild(0) instanceof RegExpCaret and
112-
not containsLeadingPseudoAnchor(root.getChild([1 .. root.getNumChild() - 1])) and
113-
containsLetters(root.getChild([1 .. root.getNumChild() - 1])) and
114-
direction = "beginning"
115-
or
116-
anchoredTerm = root.getLastChild() and
117-
anchoredTerm.getLastChild() instanceof RegExpDollar and
118-
not containsTrailingPseudoAnchor(root.getChild([0 .. root.getNumChild() - 2])) and
119-
containsLetters(root.getChild([0 .. root.getNumChild() - 2])) and
120-
direction = "end"
121-
) and
122-
// is not used for replace
123-
not exists(DataFlow::MethodCallNode replace |
124-
replace.getMethodName() = "replace" and
125-
src.getARegExpObject().flowsTo(replace.getArgument(0))
126-
) and
127-
msg =
128-
"Misleading operator precedence. The subexpression '" + anchoredTerm.getRawValue() +
129-
"' is anchored at the " + direction +
130-
", but the other parts of this regular expression are not"
131-
)
132-
}
133-
134-
/**
135-
* Holds if `term` is a final term, that is, no term will match anything after this one.
136-
*/
137-
predicate isFinalRegExpTerm(RegExpTerm term) {
138-
term.isRootTerm()
139-
or
140-
exists(RegExpSequence seq |
141-
isFinalRegExpTerm(seq) and
142-
term = seq.getLastChild()
143-
)
144-
or
145-
exists(RegExpTerm parent |
146-
isFinalRegExpTerm(parent) and
147-
term = parent.getAChild() and
148-
not parent instanceof RegExpSequence and
149-
not parent instanceof RegExpQuantifier
150-
)
151-
}
152-
153-
/**
154-
* Holds if `src` contains a hostname pattern that is missing a `$` anchor.
155-
*/
156-
predicate isSemiAnchoredHostnameRegExp(RegExpPatternSource src, string msg) {
157-
not hasMisleadingAnchorPrecedence(src, _) and // avoid double reporting
158-
exists(RegExpTerm term, RegExpSequence tld, int i | term = src.getRegExpTerm() |
159-
not isConstantInvalidInsideOrigin(term.getAChild*()) and
160-
tld = term.getAChild*() and
161-
hasTopLevelDomainEnding(tld, i) and
162-
isFinalRegExpTerm(tld.getChild(i)) and // nothing is matched after the TLD
163-
tld.getChild(0) instanceof RegExpCaret and
164-
msg =
165-
"This hostname pattern may match any domain name, as it is missing a '$' or '/' at the end."
166-
)
167-
}
168-
169-
/**
170-
* Holds if `src` is an unanchored pattern for a URL, indicating a
171-
* mistake explained by `msg`.
172-
*/
173-
predicate isUnanchoredHostnameRegExp(RegExpPatternSource src, string msg) {
174-
exists(RegExpTerm term, RegExpSequence tld | term = src.getRegExpTerm() |
175-
alwaysMatchesHostname(term) and
176-
tld = term.getAChild*() and
177-
hasTopLevelDomainEnding(tld) and
178-
not isConstantInvalidInsideOrigin(term.getAChild*()) and
179-
not term.getAChild*() instanceof RegExpAnchor and
180-
// that is not used for capture or replace
181-
not exists(DataFlow::MethodCallNode mcn, string name | name = mcn.getMethodName() |
15+
private import semmle.javascript.security.regexp.HostnameRegexp as HostnameRegexp
16+
private import codeql.regex.MissingRegExpAnchor as MissingRegExpAnchor
17+
private import semmle.javascript.security.regexp.RegExpTreeView::RegExpTreeView as TreeImpl
18+
19+
private module Impl implements
20+
MissingRegExpAnchor::MissingRegExpAnchorSig<TreeImpl, HostnameRegexp::Impl> {
21+
predicate isUsedAsReplace(RegExpPatternSource pattern) {
22+
// is used for capture or replace
23+
exists(DataFlow::MethodCallNode mcn, string name | name = mcn.getMethodName() |
18224
name = "exec" and
183-
mcn = src.getARegExpObject().getAMethodCall() and
25+
mcn = pattern.getARegExpObject().getAMethodCall() and
18426
exists(mcn.getAPropertyRead())
18527
or
18628
exists(DataFlow::Node arg |
18729
arg = mcn.getArgument(0) and
18830
(
189-
src.getARegExpObject().flowsTo(arg) or
190-
src.getAParse() = arg
31+
pattern.getARegExpObject().flowsTo(arg) or
32+
pattern.getAParse() = arg
19133
)
19234
|
19335
name = "replace"
19436
or
19537
name = "match" and exists(mcn.getAPropertyRead())
19638
)
197-
) and
198-
msg =
199-
"When this is used as a regular expression on a URL, it may match anywhere, and arbitrary hosts may come before or after it."
200-
)
39+
)
40+
}
41+
42+
string getEndAnchorText() { result = "$" }
20143
}
20244

45+
import MissingRegExpAnchor::Make<TreeImpl, HostnameRegexp::Impl, Impl>
46+
20347
from DataFlow::Node nd, string msg
20448
where
20549
isUnanchoredHostnameRegExp(nd, msg)
20650
or
20751
isSemiAnchoredHostnameRegExp(nd, msg)
20852
or
20953
hasMisleadingAnchorPrecedence(nd, msg)
54+
// isLineAnchoredHostnameRegExp is not used here, as it is not relevant to JS.
21055
select nd, msg

python/ql/lib/semmle/python/RegexTreeView.qll

Lines changed: 25 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -102,6 +102,9 @@ module Impl implements RegexTreeViewSig {
102102
/** Gets the number of child terms. */
103103
int getNumChild() { result = count(this.getAChild()) }
104104

105+
/** Gets the last child term of this element. */
106+
RegExpTerm getLastChild() { result = this.getChild(this.getNumChild() - 1) }
107+
105108
/** Gets the associated regex. */
106109
abstract Regex getRegex();
107110
}
@@ -561,6 +564,13 @@ module Impl implements RegexTreeViewSig {
561564
RegExpWordBoundary() { this.getChar() = "\\b" }
562565
}
563566

567+
/**
568+
* A non-word boundary, that is, a regular expression term of the form `\B`.
569+
*/
570+
class RegExpNonWordBoundary extends RegExpSpecialChar {
571+
RegExpNonWordBoundary() { this.getChar() = "\\B" }
572+
}
573+
564574
/**
565575
* A character class escape in a regular expression.
566576
* That is, an escaped character that denotes multiple characters.
@@ -829,6 +839,19 @@ module Impl implements RegexTreeViewSig {
829839
override string getPrimaryQLClass() { result = "RegExpDot" }
830840
}
831841

842+
/**
843+
* A term that matches a specific position between characters in the string.
844+
*
845+
* Example:
846+
*
847+
* ```
848+
* \A
849+
* ```
850+
*/
851+
class RegExpAnchor extends RegExpSpecialChar {
852+
RegExpAnchor() { this.getChar() = ["^", "$"] }
853+
}
854+
832855
/**
833856
* A dollar assertion `$` or `\Z` matching the end of a line.
834857
*
@@ -838,7 +861,7 @@ module Impl implements RegexTreeViewSig {
838861
* $
839862
* ```
840863
*/
841-
class RegExpDollar extends RegExpSpecialChar {
864+
class RegExpDollar extends RegExpAnchor {
842865
RegExpDollar() { this.getChar() = ["$", "\\Z"] }
843866

844867
override string getPrimaryQLClass() { result = "RegExpDollar" }
@@ -853,7 +876,7 @@ module Impl implements RegexTreeViewSig {
853876
* ^
854877
* ```
855878
*/
856-
class RegExpCaret extends RegExpSpecialChar {
879+
class RegExpCaret extends RegExpAnchor {
857880
RegExpCaret() { this.getChar() = ["^", "\\A"] }
858881

859882
override string getPrimaryQLClass() { result = "RegExpCaret" }

0 commit comments

Comments
 (0)