Skip to content

Commit 01d581e

Browse files
authored
Merge pull request github#5250 from tausbn/python-port-re-security-queries
Python: Port URL sanitisation queries to API graphs
2 parents 2b54c33 + 404649d commit 01d581e

File tree

4 files changed

+88
-28
lines changed

4 files changed

+88
-28
lines changed

python/ql/src/semmle/python/regex.qll

Lines changed: 85 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -1,34 +1,39 @@
11
import python
2-
import semmle.python.objects.ObjectInternal
2+
deprecated import semmle.python.objects.ObjectInternal as OI
3+
private import semmle.python.ApiGraphs
34

4-
private predicate re_module_function(string name, int flags) {
5-
name = "compile" and flags = 1
5+
/**
6+
* Gets the positional argument index containing the regular expression flags for the member of the
7+
* `re` module with the name `name`.
8+
*/
9+
private int re_member_flags_arg(string name) {
10+
name = "compile" and result = 1
611
or
7-
name = "search" and flags = 2
12+
name = "search" and result = 2
813
or
9-
name = "match" and flags = 2
14+
name = "match" and result = 2
1015
or
11-
name = "split" and flags = 3
16+
name = "split" and result = 3
1217
or
13-
name = "findall" and flags = 2
18+
name = "findall" and result = 2
1419
or
15-
name = "finditer" and flags = 2
20+
name = "finditer" and result = 2
1621
or
17-
name = "sub" and flags = 4
22+
name = "sub" and result = 4
1823
or
19-
name = "subn" and flags = 4
24+
name = "subn" and result = 4
2025
}
2126

2227
/**
23-
* Gets the names and corresponding values of attributes of the `re` module that are likely to be
28+
* Gets the names and corresponding API nodes of members of the `re` module that are likely to be
2429
* methods taking regular expressions as arguments.
2530
*
2631
* This is a helper predicate that fixes a bad join order, and should not be inlined without checking
2732
* that this is safe.
2833
*/
2934
pragma[nomagic]
30-
private Value relevant_re_attr(string name) {
31-
result = Module::named("re").attr(name) and
35+
private API::Node relevant_re_member(string name) {
36+
result = API::moduleImport("re").getMember(name) and
3237
name != "escape"
3338
}
3439

@@ -39,24 +44,78 @@ private Value relevant_re_attr(string name) {
3944
predicate used_as_regex(Expr s, string mode) {
4045
(s instanceof Bytes or s instanceof Unicode) and
4146
/* Call to re.xxx(regex, ... [mode]) */
42-
exists(CallNode call, string name |
43-
call.getArg(0).pointsTo(_, _, s.getAFlowNode()) and
44-
call.getFunction().pointsTo(relevant_re_attr(name))
47+
exists(DataFlow::CallCfgNode call, string name |
48+
call.getArg(0).asExpr() = s and
49+
call = relevant_re_member(name).getACall()
4550
|
4651
mode = "None"
4752
or
48-
exists(Value obj | mode = mode_from_mode_object(obj) |
49-
exists(int flags_arg |
50-
re_module_function(name, flags_arg) and
51-
call.getArg(flags_arg).pointsTo(obj)
52-
)
53-
or
54-
call.getArgByName("flags").pointsTo(obj)
53+
mode = mode_from_node([call.getArg(re_member_flags_arg(name)), call.getArgByName("flags")])
54+
)
55+
}
56+
57+
/**
58+
* Gets the canonical name for the API graph node corresponding to the `re` flag `flag`. For flags
59+
* that have multiple names, we pick the long-form name as a canonical representative.
60+
*/
61+
private string canonical_name(API::Node flag) {
62+
result in ["ASCII", "IGNORECASE", "LOCALE", "UNICODE", "MULTILINE", "TEMPLATE"] and
63+
flag = API::moduleImport("re").getMember([result, result.prefix(1)])
64+
or
65+
flag = API::moduleImport("re").getMember(["DOTALL", "S"]) and result = "DOTALL"
66+
or
67+
flag = API::moduleImport("re").getMember(["VERBOSE", "X"]) and result = "VERBOSE"
68+
}
69+
70+
/**
71+
* A type tracker for regular expression flag names. Holds if the result is a node that may refer
72+
* to the `re` flag with the canonical name `flag_name`
73+
*/
74+
private DataFlow::LocalSourceNode re_flag_tracker(string flag_name, DataFlow::TypeTracker t) {
75+
t.start() and
76+
exists(API::Node flag | flag_name = canonical_name(flag) and result = flag.getAUse())
77+
or
78+
exists(BinaryExprNode binop, DataFlow::Node operand |
79+
operand.getALocalSource() = re_flag_tracker(flag_name, t.continue()) and
80+
operand.asCfgNode() = binop.getAnOperand() and
81+
(binop.getOp() instanceof BitOr or binop.getOp() instanceof Add) and
82+
result.asCfgNode() = binop
83+
)
84+
or
85+
// Due to bad performance when using normal setup with `re_flag_tracker(t2, attr_name).track(t2, t)`
86+
// we have inlined that code and forced a join
87+
exists(DataFlow::TypeTracker t2 |
88+
exists(DataFlow::StepSummary summary |
89+
re_flag_tracker_first_join(t2, flag_name, result, summary) and
90+
t = t2.append(summary)
5591
)
5692
)
5793
}
5894

59-
string mode_from_mode_object(Value obj) {
95+
pragma[nomagic]
96+
private predicate re_flag_tracker_first_join(
97+
DataFlow::TypeTracker t2, string flag_name, DataFlow::Node res, DataFlow::StepSummary summary
98+
) {
99+
DataFlow::StepSummary::step(re_flag_tracker(flag_name, t2), res, summary)
100+
}
101+
102+
/**
103+
* A type tracker for regular expression flag names. Holds if the result is a node that may refer
104+
* to the `re` flag with the canonical name `flag_name`
105+
*/
106+
private DataFlow::Node re_flag_tracker(string flag_name) {
107+
re_flag_tracker(flag_name, DataFlow::TypeTracker::end()).flowsTo(result)
108+
}
109+
110+
/** Gets a regular expression mode flag associated with the given data flow node. */
111+
string mode_from_node(DataFlow::Node node) { node = re_flag_tracker(result) }
112+
113+
/**
114+
* DEPRECATED 2021-02-24 -- use `mode_from_node` instead.
115+
*
116+
* Gets a regular expression mode flag associated with the given value.
117+
*/
118+
deprecated string mode_from_mode_object(Value obj) {
60119
(
61120
result = "DEBUG" or
62121
result = "IGNORECASE" or
@@ -67,8 +126,8 @@ string mode_from_mode_object(Value obj) {
67126
result = "VERBOSE"
68127
) and
69128
exists(int flag |
70-
flag = Value::named("sre_constants.SRE_FLAG_" + result).(ObjectInternal).intValue() and
71-
obj.(ObjectInternal).intValue().bitAnd(flag) = flag
129+
flag = Value::named("sre_constants.SRE_FLAG_" + result).(OI::ObjectInternal).intValue() and
130+
obj.(OI::ObjectInternal).intValue().bitAnd(flag) = flag
72131
)
73132
}
74133

python/ql/test/library-tests/regex/Mode.expected

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7,5 +7,7 @@
77
| 50 | VERBOSE |
88
| 51 | UNICODE |
99
| 52 | UNICODE |
10+
| 54 | DOTALL |
11+
| 54 | VERBOSE |
1012
| 56 | VERBOSE |
1113
| 68 | MULTILINE |

python/ql/test/library-tests/regex/options

Lines changed: 0 additions & 1 deletion
This file was deleted.

python/ql/test/library-tests/regex/test.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -51,7 +51,7 @@
5151
re.search("", None, re.UNICODE)
5252
x = re.search("", flags=re.UNICODE)
5353
# using addition for flags was reported as FP in https://github.com/github/codeql/issues/4707
54-
re.compile("", re.VERBOSE+re.DOTALL) # TODO: Currently not recognized with Mode.ql
54+
re.compile("", re.VERBOSE+re.DOTALL)
5555
# re.X is an alias for re.VERBOSE
5656
re.compile("", re.X)
5757

0 commit comments

Comments
 (0)