Skip to content

Commit 4943fc5

Browse files
committed
Python: Model taint from re.<func> calls
1 parent 851c30e commit 4943fc5

File tree

2 files changed

+86
-11
lines changed

2 files changed

+86
-11
lines changed

python/ql/lib/semmle/python/frameworks/Stdlib.qll

Lines changed: 74 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3157,6 +3157,80 @@ private module StdlibPrivate {
31573157
}
31583158
}
31593159

3160+
/**
3161+
* A flow summary for the `re` functions `split`, `findall`, `finditer`, `sub` and `subn`, and for the same-named methods on compiled patterns; none of these return a `re.Match` object.
3162+
*
3163+
* See https://docs.python.org/3/library/re.html#functions
3164+
*/
3165+
class ReFunctionsSummary extends SummarizedCallable {
3166+
string methodName;
3167+
3168+
ReFunctionsSummary() {
3169+
methodName in ["split", "findall", "finditer", "sub", "subn"] and
3170+
this = ["re.", "compiled re."] + methodName
3171+
}
3172+
3173+
override DataFlow::CallCfgNode getACall() {
3174+
this = "re." + methodName and
3175+
result = API::moduleImport("re").getMember(methodName).getACall()
3176+
or
3177+
this = "compiled re." + methodName and
3178+
result =
3179+
any(RePatternSummary c)
3180+
.getACall()
3181+
.(API::CallNode)
3182+
.getReturn()
3183+
.getMember(methodName)
3184+
.getACall()
3185+
}
3186+
3187+
override DataFlow::ArgumentNode getACallback() { none() }
3188+
3189+
override predicate propagatesFlowExt(string input, string output, boolean preservesValue) {
3190+
exists(int offset |
3191+
// The module-level `re.<func>` functions take the pattern as their first argument, whereas
3192+
// methods on a compiled pattern do not; `offset` is subtracted from argument indices to account for this.
3193+
this = "re." + methodName and offset = 0
3194+
or
3195+
this = "compiled re." + methodName and offset = 1
3196+
|
3197+
// taint flow (preservesValue = false) from the input string to the results
3198+
exists(int arg | arg = methodName.(RegexExecutionMethod).getStringArgIndex() - offset |
3199+
preservesValue = false and
3200+
input in ["Argument[" + arg + "]", "Argument[string:]"] and
3201+
(
3202+
methodName in ["split", "findall", "finditer"] and
3203+
output = "ReturnValue.ListElement"
3204+
or
3205+
// TODO: Since we currently model lists as tainted, the results of `findall` and `split` themselves need to be tainted
3206+
methodName in ["split", "findall"] and
3207+
output = "ReturnValue"
3208+
or
3209+
methodName = "sub" and
3210+
output = "ReturnValue"
3211+
or
3212+
methodName = "subn" and
3213+
output = "ReturnValue.TupleElement[0]"
3214+
)
3215+
)
3216+
or
3217+
// taint flow from the replacement value to the result of the substitution
3218+
exists(string argumentSpec |
3219+
argumentSpec in ["Argument[" + (1 - offset) + "]", "Argument[repl:]"] and
3220+
// `repl` can also be a function, in which case taint flows from its return value
3221+
input = [argumentSpec, argumentSpec + ".ReturnValue"]
3222+
|
3223+
(
3224+
methodName = "sub" and output = "ReturnValue"
3225+
or
3226+
methodName = "subn" and output = "ReturnValue.TupleElement[0]"
3227+
) and
3228+
preservesValue = false
3229+
)
3230+
)
3231+
}
3232+
}
3233+
31603234
/**
31613235
* A call to 're.escape'.
31623236
* See https://docs.python.org/3/library/re.html#re.escape

python/ql/test/library-tests/frameworks/stdlib/test_re.py

Lines changed: 12 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -13,29 +13,30 @@
1313
re.fullmatch(pat, ts), # $ MISSING: tainted
1414

1515
# other functions not returning Match objects
16-
re.split(pat, ts), # $ MISSING: tainted
17-
re.split(pat, ts)[0], # $ MISSING: tainted
16+
re.split(pat, ts), # $ tainted
17+
re.split(pat, ts)[0], # $ tainted
1818

19-
re.findall(pat, ts), # $ MISSING: tainted
19+
re.findall(pat, ts), # $ tainted
20+
re.findall(pat, ts)[0], # $ tainted
2021

2122
re.finditer(pat, ts), # $ MISSING: tainted
22-
[x for x in re.finditer(pat, ts)], # $ MISSING: tainted
23+
[x for x in re.finditer(pat, ts)], # $ tainted
2324

24-
re.sub(pat, repl="safe", string=ts), # $ MISSING: tainted
25-
re.sub(pat, repl=lambda m: ..., string=ts), # $ MISSING: tainted
26-
re.sub(pat, repl=ts, string="safe"), # $ MISSING: tainted
27-
re.sub(pat, repl=lambda m: ts, string="safe"), # $ MISSING: tainted
25+
re.sub(pat, repl="safe", string=ts), # $ tainted
26+
re.sub(pat, repl=lambda m: ..., string=ts), # $ tainted
27+
re.sub(pat, repl=ts, string="safe"), # $ tainted
28+
re.sub(pat, repl=lambda m: ts, string="safe"), # $ tainted
2829

2930
re.subn(pat, repl="safe", string=ts), # $ MISSING: tainted
30-
re.subn(pat, repl="safe", string=ts)[0], # $ MISSING: tainted // the string
31+
re.subn(pat, repl="safe", string=ts)[0], # $ tainted // the string
3132

3233
# same for compiled patterns
3334
compiled_pat.search(ts), # $ MISSING: tainted
3435
compiled_pat.match(ts), # $ MISSING: tainted
3536
compiled_pat.fullmatch(ts), # $ MISSING: tainted
3637

37-
compiled_pat.split(ts), # $ MISSING: tainted
38-
compiled_pat.split(ts)[0], # $ MISSING: tainted
38+
compiled_pat.split(ts), # $ tainted
39+
compiled_pat.split(ts)[0], # $ tainted
3940

4041
# ...
4142

0 commit comments

Comments
 (0)