Skip to content

Commit 6bb98b0

Browse files
authored
Merge pull request #17577 from yoff/python/add-comprehension-capture-flow
python: capture flow through comprehensions
2 parents 04a4fb2 + 6f5b949 commit 6bb98b0

File tree

17 files changed

+170
-60
lines changed

17 files changed

+170
-60
lines changed
Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,5 @@
1+
---
2+
category: minorAnalysis
3+
---
4+
* More precise modelling of the dataflow through comprehensions. In particular, captured variables are now handled correctly.
5+
* Dataflow out of yield is added, allowing proper tracing through generators.

python/ql/lib/semmle/python/dataflow/new/internal/DataFlowDispatch.qll

Lines changed: 83 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -163,6 +163,10 @@ newtype TArgumentPosition =
163163
// position, we need to ensure we make these available (these are specified as
164164
// parameters in the flow-summary spec)
165165
FlowSummaryImpl::ParsePositions::isParsedPositionalParameterPosition(_, index)
166+
or
167+
// the generated function inside a comprehension has a positional argument at index 0
168+
exists(Comp c) and
169+
index = 0
166170
} or
167171
TKeywordArgumentPosition(string name) {
168172
exists(any(CallNode c).getArgByName(name))
@@ -314,12 +318,11 @@ newtype TDataFlowCallable =
314318
* class instantiations, and (in the future) special methods.
315319
*/
316320
TFunction(Function func) {
317-
// For generators/list-comprehensions we create a synthetic function. In the
318-
// points-to call-graph these were not considered callable, and instead we added
319-
// data-flow steps (read/write) for these. As an easy solution for now, we do the
320-
// same to keep things easy to reason about (and therefore exclude things that do
321-
// not have a definition)
321+
// Functions with an explicit definition
322322
exists(func.getDefinition())
323+
or
324+
// For generators/list-comprehensions we create a synthetic function.
325+
exists(Comp c | c.getFunction() = func)
323326
} or
324327
/** see QLDoc for `DataFlowModuleScope` for why we need this. */
325328
TModule(Module m) or
@@ -1381,6 +1384,8 @@ private predicate sameEnclosingCallable(Node node1, Node node2) {
13811384
// =============================================================================
13821385
newtype TDataFlowCall =
13831386
TNormalCall(CallNode call, Function target, CallType type) { resolveCall(call, target, type) } or
1387+
/** A call to the generated function inside a comprehension */
1388+
TComprehensionCall(Comp c) or
13841389
TPotentialLibraryCall(CallNode call) or
13851390
/** A synthesized call inside a summarized callable */
13861391
TSummaryCall(
@@ -1467,6 +1472,34 @@ class NormalCall extends ExtractedDataFlowCall, TNormalCall {
14671472
CallType getCallType() { result = type }
14681473
}
14691474

1475+
/** A call to the generated function inside a comprhension */
1476+
class ComprehensionCall extends ExtractedDataFlowCall, TComprehensionCall {
1477+
Comp c;
1478+
Function target;
1479+
1480+
ComprehensionCall() {
1481+
this = TComprehensionCall(c) and
1482+
target = c.getFunction()
1483+
}
1484+
1485+
Comp getComprehension() { result = c }
1486+
1487+
override string toString() { result = "comprehension call" }
1488+
1489+
override ControlFlowNode getNode() { result.getNode() = c }
1490+
1491+
override Scope getScope() { result = c.getScope() }
1492+
1493+
override DataFlowCallable getCallable() { result.(DataFlowFunction).getScope() = target }
1494+
1495+
override ArgumentNode getArgument(ArgumentPosition apos) {
1496+
result.asExpr() = c.getIterable() and
1497+
apos.isPositional(0)
1498+
}
1499+
1500+
override Location getLocation() { result = c.getLocation() }
1501+
}
1502+
14701503
/**
14711504
* A potential call to a summarized callable, a `LibraryCallable`.
14721505
*
@@ -1697,6 +1730,47 @@ class CapturedVariablesArgumentNode extends CfgNode, ArgumentNode {
16971730
}
16981731
}
16991732

1733+
/** A synthetic node representing the values of variables captured by a comprehension. */
1734+
class SynthCompCapturedVariablesArgumentNode extends Node, TSynthCompCapturedVariablesArgumentNode,
1735+
ArgumentNode
1736+
{
1737+
Comp comp;
1738+
1739+
SynthCompCapturedVariablesArgumentNode() { this = TSynthCompCapturedVariablesArgumentNode(comp) }
1740+
1741+
override string toString() { result = "Capturing closure argument (comp)" }
1742+
1743+
override Scope getScope() { result = comp.getScope() }
1744+
1745+
override Location getLocation() { result = comp.getLocation() }
1746+
1747+
Comp getComprehension() { result = comp }
1748+
1749+
override predicate argumentOf(DataFlowCall call, ArgumentPosition pos) {
1750+
call.(ComprehensionCall).getComprehension() = comp and
1751+
pos.isLambdaSelf()
1752+
}
1753+
}
1754+
1755+
/** A synthetic node representing the values of variables captured by a comprehension after the output has been computed. */
1756+
class SynthCompCapturedVariablesArgumentPostUpdateNode extends PostUpdateNodeImpl,
1757+
TSynthCompCapturedVariablesArgumentPostUpdateNode
1758+
{
1759+
Comp comp;
1760+
1761+
SynthCompCapturedVariablesArgumentPostUpdateNode() {
1762+
this = TSynthCompCapturedVariablesArgumentPostUpdateNode(comp)
1763+
}
1764+
1765+
override string toString() { result = "[post] Capturing closure argument (comp)" }
1766+
1767+
override Scope getScope() { result = comp.getScope() }
1768+
1769+
override Location getLocation() { result = comp.getLocation() }
1770+
1771+
override Node getPreUpdateNode() { result = TSynthCompCapturedVariablesArgumentNode(comp) }
1772+
}
1773+
17001774
/** Gets a viable run-time target for the call `call`. */
17011775
DataFlowCallable viableCallable(DataFlowCall call) {
17021776
call instanceof ExtractedDataFlowCall and
@@ -1736,7 +1810,10 @@ abstract class ReturnNode extends Node {
17361810
/** A data flow node that represents a value returned by a callable. */
17371811
class ExtractedReturnNode extends ReturnNode, CfgNode {
17381812
// See `TaintTrackingImplementation::returnFlowStep`
1739-
ExtractedReturnNode() { node = any(Return ret).getValue().getAFlowNode() }
1813+
ExtractedReturnNode() {
1814+
node = any(Return ret).getValue().getAFlowNode() or
1815+
node = any(Yield yield).getAFlowNode()
1816+
}
17401817

17411818
override ReturnKind getKind() { any() }
17421819
}

python/ql/lib/semmle/python/dataflow/new/internal/DataFlowPrivate.qll

Lines changed: 33 additions & 27 deletions
Original file line numberDiff line numberDiff line change
@@ -168,6 +168,26 @@ private predicate synthDictSplatArgumentNodeStoreStep(
168168
)
169169
}
170170

171+
/**
172+
* Holds if `nodeFrom` is the value yielded by the `yield` found at `nodeTo`.
173+
*
174+
* For example, in
175+
* ```python
176+
* for x in l:
177+
* yield x.name
178+
* ```
179+
* data from `x.name` is stored into the `yield` (and can subsequently be read out of the iterable).
180+
*/
181+
predicate yieldStoreStep(Node nodeFrom, Content c, Node nodeTo) {
182+
exists(Yield yield |
183+
nodeTo.asCfgNode() = yield.getAFlowNode() and
184+
nodeFrom.asCfgNode() = yield.getValue().getAFlowNode() and
185+
// TODO: Consider if this will also need to transfer dictionary content
186+
// once dictionary comprehensions are supported.
187+
c instanceof ListElementContent
188+
)
189+
}
190+
171191
/**
172192
* Ensures that the a `**kwargs` parameter will not contain elements with names of
173193
* keyword parameters.
@@ -256,6 +276,12 @@ abstract class PostUpdateNodeImpl extends Node {
256276
abstract Node getPreUpdateNode();
257277
}
258278

279+
/**
280+
* A post-update node synthesised for an existing control flow node.
281+
* Add to `TSyntheticPostUpdateNode` to get the synthetic post-update node synthesised.
282+
*
283+
* Synthetic post-update nodes for synthetic nodes need to be listed one by one.
284+
*/
259285
class SyntheticPostUpdateNode extends PostUpdateNodeImpl, TSyntheticPostUpdateNode {
260286
ControlFlowNode node;
261287

@@ -270,6 +296,11 @@ class SyntheticPostUpdateNode extends PostUpdateNodeImpl, TSyntheticPostUpdateNo
270296
override Location getLocation() { result = node.getLocation() }
271297
}
272298

299+
/**
300+
* An existsing control flow node being the post-update node of a synthetic pre-update node.
301+
*
302+
* Synthetic post-update nodes for synthetic nodes need to be listed one by one.
303+
*/
273304
class NonSyntheticPostUpdateNode extends PostUpdateNodeImpl, CfgNode {
274305
SyntheticPreUpdateNode pre;
275306

@@ -668,8 +699,6 @@ predicate storeStep(Node nodeFrom, ContentSet c, Node nodeTo) {
668699
or
669700
setStoreStep(nodeFrom, c, nodeTo)
670701
or
671-
comprehensionStoreStep(nodeFrom, c, nodeTo)
672-
or
673702
attributeStoreStep(nodeFrom, c, nodeTo)
674703
or
675704
matchStoreStep(nodeFrom, c, nodeTo)
@@ -683,6 +712,8 @@ predicate storeStep(Node nodeFrom, ContentSet c, Node nodeTo) {
683712
or
684713
synthDictSplatArgumentNodeStoreStep(nodeFrom, c, nodeTo)
685714
or
715+
yieldStoreStep(nodeFrom, c, nodeTo)
716+
or
686717
VariableCapture::storeStep(nodeFrom, c, nodeTo)
687718
}
688719

@@ -843,31 +874,6 @@ predicate dictClearStep(Node node, DictionaryElementContent c) {
843874
)
844875
}
845876

846-
/** Data flows from an element expression in a comprehension to the comprehension. */
847-
predicate comprehensionStoreStep(CfgNode nodeFrom, Content c, CfgNode nodeTo) {
848-
// Comprehension
849-
// `[x+1 for x in l]`
850-
// nodeFrom is `x+1`, cfg node
851-
// nodeTo is `[x+1 for x in l]`, cfg node
852-
// c denotes list or set or dictionary without index
853-
//
854-
// List
855-
nodeTo.getNode().getNode().(ListComp).getElt() = nodeFrom.getNode().getNode() and
856-
c instanceof ListElementContent
857-
or
858-
// Set
859-
nodeTo.getNode().getNode().(SetComp).getElt() = nodeFrom.getNode().getNode() and
860-
c instanceof SetElementContent
861-
or
862-
// Dictionary
863-
nodeTo.getNode().getNode().(DictComp).getElt() = nodeFrom.getNode().getNode() and
864-
c instanceof DictionaryElementAnyContent
865-
or
866-
// Generator
867-
nodeTo.getNode().getNode().(GeneratorExp).getElt() = nodeFrom.getNode().getNode() and
868-
c instanceof ListElementContent
869-
}
870-
871877
/**
872878
* Holds if `nodeFrom` flows into the attribute `c` of `nodeTo` via an attribute assignment.
873879
*

python/ql/lib/semmle/python/dataflow/new/internal/DataFlowPublic.qll

Lines changed: 14 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -71,6 +71,9 @@ newtype TNode =
7171
def.getDefiningNode() = node and
7272
def.getParameter() = func.getArg(0)
7373
)
74+
or
75+
// the iterable argument to the implicit comprehension function
76+
node.getNode() = any(Comp c).getIterable()
7477
} or
7578
/** A node representing a global (module-level) variable in a specific module. */
7679
TModuleVariableNode(Module m, GlobalVariable v) {
@@ -124,9 +127,16 @@ newtype TNode =
124127
/** A synthetic node representing the heap of a function. Used for variable capture. */
125128
TSynthCapturedVariablesParameterNode(Function f) {
126129
f = any(VariableCapture::CapturedVariable v).getACapturingScope() and
127-
// TODO: Remove this restriction when adding proper support for captured variables in the body of the function we generate for comprehensions
128130
exists(TFunction(f))
129131
} or
132+
/** A synthetic node representing the values of variables captured by a comprehension. */
133+
TSynthCompCapturedVariablesArgumentNode(Comp comp) {
134+
comp.getFunction() = any(VariableCapture::CapturedVariable v).getACapturingScope()
135+
} or
136+
/** A synthetic node representing the values of variables captured by a comprehension after the output has been computed. */
137+
TSynthCompCapturedVariablesArgumentPostUpdateNode(Comp comp) {
138+
comp.getFunction() = any(VariableCapture::CapturedVariable v).getACapturingScope()
139+
} or
130140
/** An empty, unused node type that exists to prevent unwanted dependencies on data flow nodes. */
131141
TForbiddenRecursionGuard() {
132142
none() and
@@ -350,6 +360,9 @@ class ExtractedArgumentNode extends ArgumentNode {
350360
or
351361
// and self arguments
352362
this.asCfgNode() = any(CallNode c).getFunction().(AttrNode).getObject()
363+
or
364+
// for comprehensions, we allow the synthetic `iterable` argument
365+
this.asExpr() = any(Comp c).getIterable()
353366
}
354367

355368
final override predicate argumentOf(DataFlowCall call, ArgumentPosition pos) {

python/ql/lib/semmle/python/dataflow/new/internal/IterableUnpacking.qll

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -187,7 +187,7 @@ class ForTarget extends ControlFlowNode {
187187
)
188188
or
189189
exists(Comp comp |
190-
source = comp.getIterable() and
190+
source = comp.getFunction().getArg(0) and
191191
this.getNode() = comp.getNthInnerLoop(0).getTarget()
192192
)
193193
}

python/ql/lib/semmle/python/dataflow/new/internal/TaintTrackingPrivate.qll

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -186,7 +186,7 @@ predicate containerStep(DataFlow::Node nodeFrom, DataFlow::Node nodeTo) {
186186
// TODO: once we have proper flow-summary modeling, we might not need this step any
187187
// longer -- but there needs to be a matching read-step for the store-step, and we
188188
// don't provide that right now.
189-
DataFlowPrivate::comprehensionStoreStep(nodeFrom, _, nodeTo)
189+
DataFlowPrivate::yieldStoreStep(nodeFrom, _, nodeTo)
190190
}
191191

192192
/**

python/ql/lib/semmle/python/dataflow/new/internal/VariableCapture.qll

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -127,6 +127,11 @@ module Flow = Shared::Flow<Location, CaptureInput>;
127127
private Flow::ClosureNode asClosureNode(Node n) {
128128
result = n.(SynthCaptureNode).getSynthesizedCaptureNode()
129129
or
130+
exists(Comp comp | n = TSynthCompCapturedVariablesArgumentNode(comp) |
131+
result.(Flow::ExprNode).getExpr().getNode() = comp
132+
)
133+
or
134+
// TODO: Should the `Comp`s above be excluded here?
130135
result.(Flow::ExprNode).getExpr() = n.(CfgNode).getNode()
131136
or
132137
result.(Flow::VariableWriteSourceNode).getVariableWrite() = n.(CfgNode).getNode()

python/ql/lib/semmle/python/frameworks/Stdlib.qll

Lines changed: 7 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -4251,22 +4251,27 @@ module StdlibPrivate {
42514251

42524252
override predicate propagatesFlow(string input, string output, boolean preservesValue) {
42534253
// The positional argument contains a mapping.
4254-
// TODO: Add the list-of-pairs version
42554254
// TODO: these values can be overwritten by keyword arguments
4255+
// - dict mapping
42564256
exists(DataFlow::DictionaryElementContent dc, string key | key = dc.getKey() |
42574257
input = "Argument[0].DictionaryElement[" + key + "]" and
42584258
output = "ReturnValue.DictionaryElement[" + key + "]" and
42594259
preservesValue = true
42604260
)
42614261
or
4262+
// - list-of-pairs mapping
4263+
input = "Argument[0].ListElement.TupleElement[1]" and
4264+
output = "ReturnValue.DictionaryElementAny" and
4265+
preservesValue = true
4266+
or
42624267
// The keyword arguments are added to the dictionary.
42634268
exists(DataFlow::DictionaryElementContent dc, string key | key = dc.getKey() |
42644269
input = "Argument[" + key + ":]" and
42654270
output = "ReturnValue.DictionaryElement[" + key + "]" and
42664271
preservesValue = true
42674272
)
42684273
or
4269-
// Imprecise content in any argument ends up on the container itself.
4274+
// Imprecise content in the first argument ends up on the container itself.
42704275
input = "Argument[0]" and
42714276
output = "ReturnValue" and
42724277
preservesValue = false

python/ql/test/experimental/query-tests/Security/CWE-022-TarSlip/TarSlip.expected

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -34,7 +34,9 @@ edges
3434
| TarSlipImprov.py:142:9:142:13 | ControlFlowNode for entry | TarSlipImprov.py:143:36:143:40 | ControlFlowNode for entry | provenance | |
3535
| TarSlipImprov.py:151:14:151:50 | ControlFlowNode for closing() | TarSlipImprov.py:151:55:151:56 | ControlFlowNode for tf | provenance | |
3636
| TarSlipImprov.py:151:22:151:49 | ControlFlowNode for Attribute() | TarSlipImprov.py:151:14:151:50 | ControlFlowNode for closing() | provenance | Config |
37+
| TarSlipImprov.py:151:55:151:56 | ControlFlowNode for tf | TarSlipImprov.py:152:13:152:20 | ControlFlowNode for Yield | provenance | |
3738
| TarSlipImprov.py:151:55:151:56 | ControlFlowNode for tf | TarSlipImprov.py:152:19:152:20 | ControlFlowNode for tf | provenance | |
39+
| TarSlipImprov.py:152:13:152:20 | ControlFlowNode for Yield | TarSlipImprov.py:157:18:157:40 | ControlFlowNode for py2_tarxz() | provenance | |
3840
| TarSlipImprov.py:152:19:152:20 | ControlFlowNode for tf | TarSlipImprov.py:157:18:157:40 | ControlFlowNode for py2_tarxz() | provenance | |
3941
| TarSlipImprov.py:157:9:157:14 | ControlFlowNode for tar_cm | TarSlipImprov.py:162:20:162:23 | ControlFlowNode for tarc | provenance | |
4042
| TarSlipImprov.py:157:18:157:40 | ControlFlowNode for py2_tarxz() | TarSlipImprov.py:157:9:157:14 | ControlFlowNode for tar_cm | provenance | |
@@ -131,6 +133,7 @@ nodes
131133
| TarSlipImprov.py:151:14:151:50 | ControlFlowNode for closing() | semmle.label | ControlFlowNode for closing() |
132134
| TarSlipImprov.py:151:22:151:49 | ControlFlowNode for Attribute() | semmle.label | ControlFlowNode for Attribute() |
133135
| TarSlipImprov.py:151:55:151:56 | ControlFlowNode for tf | semmle.label | ControlFlowNode for tf |
136+
| TarSlipImprov.py:152:13:152:20 | ControlFlowNode for Yield | semmle.label | ControlFlowNode for Yield |
134137
| TarSlipImprov.py:152:19:152:20 | ControlFlowNode for tf | semmle.label | ControlFlowNode for tf |
135138
| TarSlipImprov.py:157:9:157:14 | ControlFlowNode for tar_cm | semmle.label | ControlFlowNode for tar_cm |
136139
| TarSlipImprov.py:157:18:157:40 | ControlFlowNode for py2_tarxz() | semmle.label | ControlFlowNode for py2_tarxz() |

python/ql/test/library-tests/dataflow/coverage/localFlow.expected

Lines changed: 0 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -8,9 +8,4 @@
88
| test.py:208:1:208:53 | Entry definition for SsaSourceVariable SINK | test.py:210:5:210:8 | ControlFlowNode for SINK |
99
| test.py:208:1:208:53 | Entry definition for SsaSourceVariable SOURCE | test.py:209:25:209:30 | ControlFlowNode for SOURCE |
1010
| test.py:209:5:209:5 | ControlFlowNode for x | test.py:210:10:210:10 | ControlFlowNode for x |
11-
| test.py:209:9:209:68 | ControlFlowNode for .0 | test.py:209:9:209:68 | ControlFlowNode for .0 |
1211
| test.py:209:9:209:68 | ControlFlowNode for ListComp | test.py:209:5:209:5 | ControlFlowNode for x |
13-
| test.py:209:16:209:16 | ControlFlowNode for v | test.py:209:45:209:45 | ControlFlowNode for v |
14-
| test.py:209:40:209:40 | ControlFlowNode for u | test.py:209:56:209:56 | ControlFlowNode for u |
15-
| test.py:209:51:209:51 | ControlFlowNode for z | test.py:209:67:209:67 | ControlFlowNode for z |
16-
| test.py:209:62:209:62 | ControlFlowNode for y | test.py:209:10:209:10 | ControlFlowNode for y |

0 commit comments

Comments
 (0)