Skip to content

Commit 556bb41

Browse files
committed
move all code to find Regex flag into a module
1 parent f0254fc commit 556bb41

File tree

1 file changed

+97
-114
lines changed

1 file changed

+97
-114
lines changed

python/ql/lib/semmle/python/regex.qll

Lines changed: 97 additions & 114 deletions
Original file line numberDiff line numberDiff line change
@@ -3,72 +3,6 @@ private import semmle.python.ApiGraphs
33
// Need to import since frameworks can extend the abstract `RegExpInterpretation::Range`
44
private import semmle.python.Frameworks
55
private import semmle.python.Concepts as Concepts
6-
7-
/**
8-
* Gets the positional argument index containing the regular expression flags for the member of the
9-
* `re` module with the name `name`.
10-
*/
11-
private int re_member_flags_arg(string name) {
12-
name = "compile" and result = 1
13-
or
14-
name = "search" and result = 2
15-
or
16-
name = "match" and result = 2
17-
or
18-
name = "split" and result = 3
19-
or
20-
name = "findall" and result = 2
21-
or
22-
name = "finditer" and result = 2
23-
or
24-
name = "sub" and result = 4
25-
or
26-
name = "subn" and result = 4
27-
}
28-
29-
/**
30-
* Gets the names and corresponding API nodes of members of the `re` module that are likely to be
31-
* methods taking regular expressions as arguments.
32-
*
33-
* This is a helper predicate that fixes a bad join order, and should not be inlined without checking
34-
* that this is safe.
35-
*/
36-
pragma[nomagic]
37-
private API::Node relevant_re_member(string name) {
38-
result = API::moduleImport("re").getMember(name) and
39-
name != "escape"
40-
}
41-
42-
/**
43-
* Holds if the expression `e` is used as a regex with the `re` module, with the regex-mode `mode` (if known).
44-
* If regex mode is not known, `mode` will be `"None"`.
45-
*
46-
* This predicate has not done any data-flow tracking.
47-
*/
48-
// TODO: This should only be used to get the `mode`, and nowhere else.
49-
predicate used_as_regex_internal(Expr e, string mode) {
50-
/* Call to re.xxx(regex, ... [mode]) */
51-
exists(DataFlow::CallCfgNode call |
52-
call instanceof Concepts::RegexExecution and
53-
e = call.(Concepts::RegexExecution).getRegex().asExpr()
54-
or
55-
call.getArg(0).asExpr() = e and
56-
call = relevant_re_member(_).getACall()
57-
|
58-
mode = "None"
59-
or
60-
exists(DataFlow::CallCfgNode callNode |
61-
call = callNode and
62-
mode =
63-
mode_from_node([
64-
callNode
65-
.getArg(re_member_flags_arg(callNode.(DataFlow::MethodCallNode).getMethodName())),
66-
callNode.getArgByName("flags")
67-
])
68-
)
69-
)
70-
}
71-
726
private import regexp.internal.RegExpTracking as RegExpTracking
737
private import semmle.python.Concepts
748
private import semmle.python.regexp.RegexTreeView
@@ -81,49 +15,6 @@ RegExpTerm getTermForExecution(RegexExecution exec) {
8115
)
8216
}
8317

84-
/**
85-
* Gets the canonical name for the API graph node corresponding to the `re` flag `flag`. For flags
86-
* that have multiple names, we pick the long-form name as a canonical representative.
87-
*/
88-
private string canonical_name(API::Node flag) {
89-
result in ["ASCII", "IGNORECASE", "LOCALE", "UNICODE", "MULTILINE", "TEMPLATE"] and
90-
flag = API::moduleImport("re").getMember([result, result.prefix(1)])
91-
or
92-
flag = API::moduleImport("re").getMember(["DOTALL", "S"]) and result = "DOTALL"
93-
or
94-
flag = API::moduleImport("re").getMember(["VERBOSE", "X"]) and result = "VERBOSE"
95-
}
96-
97-
/**
98-
* A type tracker for regular expression flag names. Holds if the result is a node that may refer
99-
* to the `re` flag with the canonical name `flag_name`
100-
*/
101-
private DataFlow::TypeTrackingNode re_flag_tracker(string flag_name, DataFlow::TypeTracker t) {
102-
t.start() and
103-
exists(API::Node flag | flag_name = canonical_name(flag) and result = flag.asSource())
104-
or
105-
exists(BinaryExprNode binop, DataFlow::Node operand |
106-
operand.getALocalSource() = re_flag_tracker(flag_name, t.continue()) and
107-
operand.asCfgNode() = binop.getAnOperand() and
108-
(binop.getOp() instanceof BitOr or binop.getOp() instanceof Add) and
109-
result.asCfgNode() = binop
110-
)
111-
or
112-
exists(DataFlow::TypeTracker t2 | result = re_flag_tracker(flag_name, t2).track(t2, t))
113-
}
114-
115-
/**
116-
* A type tracker for regular expression flag names. Holds if the result is a node that may refer
117-
* to the `re` flag with the canonical name `flag_name`
118-
*/
119-
private DataFlow::Node re_flag_tracker(string flag_name) {
120-
re_flag_tracker(flag_name, DataFlow::TypeTracker::end()).flowsTo(result)
121-
}
122-
123-
/** Gets a regular expression mode flag associated with the given data flow node. */
124-
// TODO: Move this into a RegexFlag module, along with related code?
125-
string mode_from_node(DataFlow::Node node) { node = re_flag_tracker(result) }
126-
12718
/** Provides a class for modeling regular expression interpretations. */
12819
module RegExpInterpretation {
12920
/**
@@ -150,6 +41,102 @@ deprecated class RegexString extends Regex {
15041
RegexString() { this = RegExpTracking::regExpSource(_).asExpr() }
15142
}
15243

44+
/** Utility predicates for finding the mode of a regex based on where it's used. */
45+
private module FindRegexMode {
46+
// TODO: Move this (and Regex) into a ParseRegExp file.
47+
/**
48+
* Gets the mode of the regex `regex` based on the context where it's used.
49+
* Does not find the mode if it's set by a prefix inside the regex itself (see `Regex::getModeFromPrefix`).
50+
*/
51+
string getAMode(Regex regex) {
52+
exists(DataFlow::Node sink |
53+
sink = regex.getAUse() and
54+
/* Call to re.xxx(regex, ... [mode]) */
55+
exists(DataFlow::CallCfgNode call |
56+
call instanceof Concepts::RegexExecution and
57+
sink = call.(Concepts::RegexExecution).getRegex()
58+
or
59+
call.getArg(_) = sink and
60+
sink instanceof RegExpInterpretation::Range
61+
|
62+
exists(DataFlow::CallCfgNode callNode |
63+
call = callNode and
64+
result =
65+
mode_from_node([
66+
callNode
67+
.getArg(re_member_flags_arg(callNode.(DataFlow::MethodCallNode).getMethodName())),
68+
callNode.getArgByName("flags")
69+
])
70+
)
71+
)
72+
)
73+
}
74+
75+
/**
76+
* Gets the positional argument index containing the regular expression flags for the member of the
77+
* `re` module with the name `name`.
78+
*/
79+
private int re_member_flags_arg(string name) {
80+
name = "compile" and result = 1
81+
or
82+
name = "search" and result = 2
83+
or
84+
name = "match" and result = 2
85+
or
86+
name = "split" and result = 3
87+
or
88+
name = "findall" and result = 2
89+
or
90+
name = "finditer" and result = 2
91+
or
92+
name = "sub" and result = 4
93+
or
94+
name = "subn" and result = 4
95+
}
96+
97+
/**
98+
* Gets the canonical name for the API graph node corresponding to the `re` flag `flag`. For flags
99+
* that have multiple names, we pick the long-form name as a canonical representative.
100+
*/
101+
private string canonical_name(API::Node flag) {
102+
result in ["ASCII", "IGNORECASE", "LOCALE", "UNICODE", "MULTILINE", "TEMPLATE"] and
103+
flag = API::moduleImport("re").getMember([result, result.prefix(1)])
104+
or
105+
flag = API::moduleImport("re").getMember(["DOTALL", "S"]) and result = "DOTALL"
106+
or
107+
flag = API::moduleImport("re").getMember(["VERBOSE", "X"]) and result = "VERBOSE"
108+
}
109+
110+
/**
111+
* A type tracker for regular expression flag names. Holds if the result is a node that may refer
112+
* to the `re` flag with the canonical name `flag_name`
113+
*/
114+
private DataFlow::TypeTrackingNode re_flag_tracker(string flag_name, DataFlow::TypeTracker t) {
115+
t.start() and
116+
exists(API::Node flag | flag_name = canonical_name(flag) and result = flag.asSource())
117+
or
118+
exists(BinaryExprNode binop, DataFlow::Node operand |
119+
operand.getALocalSource() = re_flag_tracker(flag_name, t.continue()) and
120+
operand.asCfgNode() = binop.getAnOperand() and
121+
(binop.getOp() instanceof BitOr or binop.getOp() instanceof Add) and
122+
result.asCfgNode() = binop
123+
)
124+
or
125+
exists(DataFlow::TypeTracker t2 | result = re_flag_tracker(flag_name, t2).track(t2, t))
126+
}
127+
128+
/**
129+
* A type tracker for regular expression flag names. Holds if the result is a node that may refer
130+
* to the `re` flag with the canonical name `flag_name`
131+
*/
132+
private DataFlow::Node re_flag_tracker(string flag_name) {
133+
re_flag_tracker(flag_name, DataFlow::TypeTracker::end()).flowsTo(result)
134+
}
135+
136+
/** Gets a regular expression mode flag associated with the given data flow node. */
137+
private string mode_from_node(DataFlow::Node node) { node = re_flag_tracker(result) }
138+
}
139+
153140
/** A StrConst used as a regular expression */
154141
class Regex extends Expr {
155142
DataFlow::Node sink;
@@ -175,11 +162,7 @@ class Regex extends Expr {
175162
* VERBOSE
176163
*/
177164
string getAMode() {
178-
exists(string mode |
179-
used_as_regex_internal(sink.asExpr(), mode) and
180-
result != "None" and
181-
result = mode
182-
)
165+
result = FindRegexMode::getAMode(this)
183166
or
184167
result = this.getModeFromPrefix()
185168
}

0 commit comments

Comments
 (0)