Skip to content

Commit e812eb7

Browse files
committed
Python: Port URL sanitisation queries to API graphs
Really, this boils down to "Port `re` library model to use API graphs instead of points-to", which is what this PR actually does. Instead of using points-to to track flags, we use a type tracker. To handle multiple flags at the same time, we add additional flow from `x` to `x | y` and `y | x` and, as an added bonus, the above with `+` instead of `|`, neatly fixing github#4707 I had to modify the `Qualified.ql` test slightly, as it now had a result stemming from the standard library (in `warnings.py`) that points-to previously ignored. It might be possible to implement this as a type tracker on `LocalSourceNode`s, but with the added steps for the above operations, this was not obvious to me, and so I opted for the simpler "`smallstep`" variant.
1 parent f65843a commit e812eb7

File tree

4 files changed

+73
-29
lines changed

4 files changed

+73
-29
lines changed

python/ql/src/semmle/python/regex.qll

Lines changed: 68 additions & 27 deletions
Original file line numberDiff line numberDiff line change
@@ -1,34 +1,39 @@
11
import python
2-
import semmle.python.objects.ObjectInternal
2+
deprecated import semmle.python.objects.ObjectInternal as OI
3+
import semmle.python.ApiGraphs
34

4-
private predicate re_module_function(string name, int flags) {
5-
name = "compile" and flags = 1
5+
/**
6+
* Gets the positional argument index containing the regular expression flags for the member of the
7+
* `re` module with the name `name`.
8+
*/
9+
private int re_member_flags_arg(string name) {
10+
name = "compile" and result = 1
611
or
7-
name = "search" and flags = 2
12+
name = "search" and result = 2
813
or
9-
name = "match" and flags = 2
14+
name = "match" and result = 2
1015
or
11-
name = "split" and flags = 3
16+
name = "split" and result = 3
1217
or
13-
name = "findall" and flags = 2
18+
name = "findall" and result = 2
1419
or
15-
name = "finditer" and flags = 2
20+
name = "finditer" and result = 2
1621
or
17-
name = "sub" and flags = 4
22+
name = "sub" and result = 4
1823
or
19-
name = "subn" and flags = 4
24+
name = "subn" and result = 4
2025
}
2126

2227
/**
23-
* Gets the names and corresponding values of attributes of the `re` module that are likely to be
28+
* Gets the names and corresponding API nodes of members of the `re` module that are likely to be
2429
* methods taking regular expressions as arguments.
2530
*
2631
* This is a helper predicate that fixes a bad join order, and should not be inlined without checking
2732
* that this is safe.
2833
*/
2934
pragma[nomagic]
30-
private Value relevant_re_attr(string name) {
31-
result = Module::named("re").attr(name) and
35+
private API::Node relevant_re_member(string name) {
36+
result = API::moduleImport("re").getMember(name) and
3237
name != "escape"
3338
}
3439

@@ -39,24 +44,60 @@ private Value relevant_re_attr(string name) {
3944
predicate used_as_regex(Expr s, string mode) {
4045
(s instanceof Bytes or s instanceof Unicode) and
4146
/* Call to re.xxx(regex, ... [mode]) */
42-
exists(CallNode call, string name |
43-
call.getArg(0).pointsTo(_, _, s.getAFlowNode()) and
44-
call.getFunction().pointsTo(relevant_re_attr(name))
47+
exists(DataFlow::CallCfgNode call, string name |
48+
call.getArg(0).asExpr() = s and
49+
call = relevant_re_member(name).getACall()
4550
|
4651
mode = "None"
4752
or
48-
exists(Value obj | mode = mode_from_mode_object(obj) |
49-
exists(int flags_arg |
50-
re_module_function(name, flags_arg) and
51-
call.getArg(flags_arg).pointsTo(obj)
52-
)
53-
or
54-
call.getArgByName("flags").pointsTo(obj)
55-
)
53+
mode = mode_from_node([call.getArg(re_member_flags_arg(name)), call.getArgByName("flags")])
54+
)
55+
}
56+
57+
/**
58+
* Gets the canonical name for the API graph node corresponding to the `re` flag `flag`. For flags
59+
* that have multiple names, we pick the long-form name as a canonical representative.
60+
*/
61+
private string canonical_name(API::Node flag) {
62+
result in ["ASCII", "IGNORECASE", "LOCALE", "UNICODE", "MULTILINE", "TEMPLATE"] and
63+
flag = API::moduleImport("re").getMember([result, result.prefix(1)])
64+
or
65+
flag = API::moduleImport("re").getMember(["DOTALL", "S"]) and result = "DOTALL"
66+
or
67+
flag = API::moduleImport("re").getMember(["VERBOSE", "X"]) and result = "VERBOSE"
68+
}
69+
70+
/**
71+
* A type tracker for regular expression flag names. Holds if the result is a node that may refer
72+
* to the `re` flag with the canonical name `flag_name`
73+
*/
74+
private DataFlow::Node re_flag_tracker(string flag_name, DataFlow::TypeTracker t) {
75+
t.start() and
76+
exists(API::Node flag | flag_name = canonical_name(flag) and result = flag.getAUse())
77+
or
78+
exists(BinaryExprNode binop |
79+
(binop.getOp() instanceof BitOr or binop.getOp() instanceof Add) and
80+
binop.getAnOperand() = re_flag_tracker(flag_name, t.continue()).asCfgNode() and
81+
result.asCfgNode() = binop
82+
)
83+
or
84+
exists(DataFlow::TypeTracker t2, DataFlow::Node prev | prev = re_flag_tracker(flag_name, t2) |
85+
t2 = t.smallstep(prev, result)
5686
)
5787
}
5888

59-
string mode_from_mode_object(Value obj) {
89+
/**
90+
* A type tracker for regular expression flag names. Holds if the result is a node that may refer
91+
* to the `re` flag with the canonical name `flag_name`
92+
*/
93+
private DataFlow::Node re_flag_tracker(string flag_name) {
94+
result = re_flag_tracker(flag_name, DataFlow::TypeTracker::end())
95+
}
96+
97+
/** Gets a regular expression mode flag associated with the given data flow node. */
98+
string mode_from_node(DataFlow::Node node) { node = re_flag_tracker(result) }
99+
100+
deprecated string mode_from_mode_object(Value obj) {
60101
(
61102
result = "DEBUG" or
62103
result = "IGNORECASE" or
@@ -67,8 +108,8 @@ string mode_from_mode_object(Value obj) {
67108
result = "VERBOSE"
68109
) and
69110
exists(int flag |
70-
flag = Value::named("sre_constants.SRE_FLAG_" + result).(ObjectInternal).intValue() and
71-
obj.(ObjectInternal).intValue().bitAnd(flag) = flag
111+
flag = Value::named("sre_constants.SRE_FLAG_" + result).(OI::ObjectInternal).intValue() and
112+
obj.(OI::ObjectInternal).intValue().bitAnd(flag) = flag
72113
)
73114
}
74115

python/ql/test/library-tests/regex/Mode.expected

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7,5 +7,7 @@
77
| 50 | VERBOSE |
88
| 51 | UNICODE |
99
| 52 | UNICODE |
10+
| 54 | DOTALL |
11+
| 54 | VERBOSE |
1012
| 56 | VERBOSE |
1113
| 68 | MULTILINE |

python/ql/test/library-tests/regex/Qualified.ql

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2,5 +2,6 @@ import python
22
import semmle.python.regex
33

44
from Regex r, int start, int end, boolean maybe_empty
5-
where r.qualifiedItem(start, end, maybe_empty)
5+
where
6+
r.qualifiedItem(start, end, maybe_empty) and r.getLocation().getFile().getShortName() = "test.py"
67
select r.getText(), start, end, maybe_empty

python/ql/test/library-tests/regex/test.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -51,7 +51,7 @@
5151
re.search("", None, re.UNICODE)
5252
x = re.search("", flags=re.UNICODE)
5353
# using addition for flags was reported as FP in https://github.com/github/codeql/issues/4707
54-
re.compile("", re.VERBOSE+re.DOTALL) # TODO: Currently not recognized with Mode.ql
54+
re.compile("", re.VERBOSE+re.DOTALL)
5555
# re.X is an alias for re.VERBOSE
5656
re.compile("", re.X)
5757

0 commit comments

Comments
 (0)