Skip to content

Commit 36e18d5

Browse files
committed
python: dataflow for match
- also update `validTest.py`, but commented out for now otherwise CI will fail until we force it to run with Python 3.10 - added debug utility for dataflow (`dataflowTestPaths.ql`)
1 parent bb210f4 commit 36e18d5

File tree

8 files changed

+527
-6
lines changed

8 files changed

+527
-6
lines changed

python/ql/lib/semmle/python/dataflow/new/internal/DataFlowPrivate.qll

Lines changed: 304 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -249,6 +249,8 @@ module EssaFlow {
249249
// Flow inside an unpacking assignment
250250
iterableUnpackingFlowStep(nodeFrom, nodeTo)
251251
or
252+
matchFlowStep(nodeFrom, nodeTo)
253+
or
252254
// Overflow keyword argument
253255
exists(CallNode call, CallableValue callable |
254256
call = callable.getACall() and
@@ -982,6 +984,8 @@ predicate storeStep(Node nodeFrom, Content c, Node nodeTo) {
982984
posOverflowStoreStep(nodeFrom, c, nodeTo)
983985
or
984986
kwOverflowStoreStep(nodeFrom, c, nodeTo)
987+
or
988+
matchStoreStep(nodeFrom, c, nodeTo)
985989
}
986990

987991
/** Data flows from an element of a list to the list. */
@@ -1124,6 +1128,8 @@ predicate readStep(Node nodeFrom, Content c, Node nodeTo) {
11241128
or
11251129
iterableUnpackingReadStep(nodeFrom, c, nodeTo)
11261130
or
1131+
matchReadStep(nodeFrom, c, nodeTo)
1132+
or
11271133
popReadStep(nodeFrom, c, nodeTo)
11281134
or
11291135
forReadStep(nodeFrom, c, nodeTo)
@@ -1553,6 +1559,290 @@ module IterableUnpacking {
15531559

15541560
import IterableUnpacking
15551561

1562+
/**
1563+
* There are a number of patterns available for the match statement.
1564+
* Each one transfers data and content differently to its parts.
1565+
*
1566+
* Furthermore, given a successful match, we can infer some data about
1567+
* the subject. Consider the example:
1568+
* ```python
1569+
* match choice:
1570+
* case 'Y':
1571+
* ...body
1572+
* ```
1573+
* Inside `body`, we know that `choice` has the value `'Y'`.
1574+
*
1575+
* A similar thing happens with the "as pattern". Consider the example:
1576+
* ```python
1577+
* match choice:
1578+
* case ('y'|'Y') as c:
1579+
* ...body
1580+
* ```
1581+
* By the binding rules, there is data flow from `choice` to `c`. But we
1582+
* can infer the value of `c` to be either `'y'` or `'Y'` if the match succeeds.
1583+
*
1584+
* We will treat such inference separately as guards. First we will model the data flow
1585+
* stemming from the bindings and the matching of shape. Below, 'subject' is not necessarily the
1586+
* top-level subject of the match, but rather the part recursively matched by the current pattern.
1587+
* For instance, in the example:
1588+
* ```python
1589+
* match command:
1590+
* case ('quit' as c) | ('go', ('up'|'down') as c):
1591+
* ...body
1592+
* ```
1593+
* `command` is the subject of the as-pattern, while the second component of `command` is the subject
1594+
* of the first capture pattern. As such, 'subject' refers to the pattern under evaluation.
1595+
*
1596+
* - as pattern: subject flows to alias as well as to the interior pattern
1597+
* - or pattern: subject flows to each alternative
1598+
* - literal pattern: no flow
1599+
* - capture pattern: subject flows to the variable
1600+
* - wildcard pattern: no flow
1601+
* - value pattern: no flow
1602+
* - sequence pattern: each element reads from subject at the associated index
1603+
* - star pattern: subject flows to the variable, possibly via a conversion
1604+
* - mapping pattern: each value reads from subject at the associated key
1605+
* - double star pattern: subject flows to the variable, possibly via a conversion
1606+
* - key-value pattern: the value reads from the subject at the key (see mapping pattern)
1607+
* - class pattern: all keywords read the appropriate attribute from the subject
1608+
* - keyword pattern: the appropriate attribute is read from the subject (see class pattern)
1609+
*
1610+
* Inside the class pattern, we also find positional arguments. They are converted to
1611+
* keyword arguments using the `__match_args__` attribute on the class. We do not
1612+
* currently model this.
1613+
*/
1614+
module MatchUnpacking {
1615+
/**
1616+
* The subject of a match flows to each top-level pattern
1617+
* (a pattern directly under a `case` statement).
1618+
*
1619+
* We could consider a model closer to use-use-flow, where the subject
1620+
* only flows to the first top-level pattern and from there to the
1621+
* following ones.
1622+
*/
1623+
predicate matchSubjectFlowStep(Node nodeFrom, Node nodeTo) {
1624+
exists(MatchStmt match, Expr subject, Pattern target |
1625+
subject = match.getSubject() and
1626+
target = match.getCase(_).(Case).getPattern()
1627+
|
1628+
nodeFrom.asExpr() = subject and
1629+
nodeTo.asCfgNode().getNode() = target
1630+
)
1631+
}
1632+
1633+
/**
1634+
* as pattern: subject flows to alias as well as to the interior pattern
1635+
* syntax (toplevel): `case pattern as alias:`
1636+
*/
1637+
predicate matchAsFlowStep(Node nodeFrom, Node nodeTo) {
1638+
exists(MatchAsPattern subject, Name alias | alias = subject.getAlias() |
1639+
nodeFrom.asCfgNode().getNode() = subject and
1640+
(
1641+
// the subject flows to the alias
1642+
nodeTo.asVar().getDefinition().(PatternAliasDefinition).getDefiningNode().getNode() = alias
1643+
or
1644+
// the subject flows to the interior pattern
1645+
nodeTo.asCfgNode().getNode() = subject.getPattern()
1646+
)
1647+
)
1648+
}
1649+
1650+
/**
1651+
* or pattern: subject flows to each alternative
1652+
* syntax (toplevel): `case alt1 | alt2:`
1653+
*/
1654+
predicate matchOrFlowStep(Node nodeFrom, Node nodeTo) {
1655+
exists(MatchOrPattern subject, Pattern pattern | pattern = subject.getAPattern() |
1656+
nodeFrom.asCfgNode().getNode() = subject and
1657+
nodeTo.asCfgNode().getNode() = pattern
1658+
)
1659+
}
1660+
1661+
/**
1662+
* capture pattern: subject flows to the variable
1663+
* syntax (toplevel): `case var:`
1664+
*/
1665+
predicate matchCaptureFlowStep(Node nodeFrom, Node nodeTo) {
1666+
exists(MatchCapturePattern capture, Name var | capture.getVariable() = var |
1667+
nodeFrom.asCfgNode().getNode() = capture and
1668+
nodeTo.asVar().getDefinition().(PatternCaptureDefinition).getDefiningNode().getNode() = var
1669+
)
1670+
}
1671+
1672+
/**
1673+
* sequence pattern: each element reads from subject at the associated index
1674+
* syntax (toplevel): `case [a, b]:`
1675+
*/
1676+
predicate matchSequenceReadStep(Node nodeFrom, Content c, Node nodeTo) {
1677+
exists(MatchSequencePattern subject, int index, Pattern element |
1678+
element = subject.getPattern(index)
1679+
|
1680+
nodeFrom.asCfgNode().getNode() = subject and
1681+
nodeTo.asCfgNode().getNode() = element and
1682+
(
1683+
// tuple content
1684+
c.(TupleElementContent).getIndex() = index
1685+
or
1686+
// list content
1687+
c instanceof ListElementContent
1688+
// set content is excluded from sequence patterns,
1689+
// see https://www.python.org/dev/peps/pep-0635/#sequence-patterns
1690+
)
1691+
)
1692+
}
1693+
1694+
/**
1695+
* star pattern: subject flows to the variable, possibly via a conversion
1696+
* syntax (toplevel): `case *var:`
1697+
*
1698+
* We decompose this flow into a read step and a store step. The read step
1699+
* reads both tuple and list content, the store step only stores list content.
1700+
* This way, we convert all content to list content.
1701+
*
1702+
* This is the read step.
1703+
*/
1704+
predicate matchStarReadStep(Node nodeFrom, Content c, Node nodeTo) {
1705+
exists(MatchSequencePattern subject, int index, MatchStarPattern star |
1706+
star = subject.getPattern(index)
1707+
|
1708+
nodeFrom.asCfgNode().getNode() = subject and
1709+
nodeTo = TStarPatternElementNode(star) and
1710+
(
1711+
// tuple content
1712+
c.(TupleElementContent).getIndex() >= index
1713+
or
1714+
// list content
1715+
c instanceof ListElementContent
1716+
// set content is excluded from sequence patterns,
1717+
// see https://www.python.org/dev/peps/pep-0635/#sequence-patterns
1718+
)
1719+
)
1720+
}
1721+
1722+
/**
1723+
* star pattern: subject flows to the variable, possibly via a conversion
1724+
* syntax (toplevel): `case *var:`
1725+
*
1726+
* We decompose this flow into a read step and a store step. The read step
1727+
* reads both tuple and list content, the store step only stores list content.
1728+
* This way, we convert all content to list content.
1729+
*
1730+
* This is the store step.
1731+
*/
1732+
predicate matchStarStoreStep(Node nodeFrom, Content c, Node nodeTo) {
1733+
exists(MatchStarPattern star |
1734+
nodeFrom = TStarPatternElementNode(star) and
1735+
nodeTo.asCfgNode().getNode() = star.getTarget() and
1736+
c instanceof ListElementContent
1737+
)
1738+
}
1739+
1740+
/**
1741+
* mapping pattern: each value reads from subject at the associated key
1742+
* syntax (toplevel): `case {"color": c, "height": x}:`
1743+
*/
1744+
predicate matchMappingReadStep(Node nodeFrom, Content c, Node nodeTo) {
1745+
exists(
1746+
MatchMappingPattern subject, MatchKeyValuePattern keyValue, MatchLiteralPattern key,
1747+
Pattern value
1748+
|
1749+
keyValue = subject.getAMapping() and
1750+
key = keyValue.getKey() and
1751+
value = keyValue.getValue()
1752+
|
1753+
nodeFrom.asCfgNode().getNode() = subject and
1754+
nodeTo.asCfgNode().getNode() = value and
1755+
c.(DictionaryElementContent).getKey() = key.getLiteral().(StrConst).getText()
1756+
)
1757+
}
1758+
1759+
/**
1760+
* double star pattern: subject flows to the variable, possibly via a conversion
1761+
* syntax (toplevel): `case {**var}:`
1762+
*
1763+
* Dictionary content flows to the double star, but all mentioned keys in the
1764+
* mapping pattern should be cleared.
1765+
*/
1766+
predicate matchMappingFlowStep(Node nodeFrom, Node nodeTo) {
1767+
exists(MatchMappingPattern subject, MatchDoubleStarPattern dstar |
1768+
dstar = subject.getAMapping()
1769+
|
1770+
nodeFrom.asCfgNode().getNode() = subject and
1771+
nodeTo.asCfgNode().getNode() = dstar.getTarget()
1772+
)
1773+
}
1774+
1775+
/**
1776+
* Bindings that are mentioned in a mapping pattern will not be available
1777+
* to a double star pattern in the same mapping pattern.
1778+
*/
1779+
predicate matchMappingClearStep(Node n, Content c) {
1780+
exists(
1781+
MatchMappingPattern subject, MatchKeyValuePattern keyValue, MatchLiteralPattern key,
1782+
MatchDoubleStarPattern dstar
1783+
|
1784+
keyValue = subject.getAMapping() and
1785+
key = keyValue.getKey() and
1786+
dstar = subject.getAMapping()
1787+
|
1788+
n.asCfgNode().getNode() = dstar.getTarget() and
1789+
c.(DictionaryElementContent).getKey() = key.getLiteral().(StrConst).getText()
1790+
)
1791+
}
1792+
1793+
/**
1794+
* class pattern: all keywords read the appropriate attribute from the subject
1795+
* syntax (toplevel): `case ClassName(attr = val):`
1796+
*/
1797+
predicate matchClassReadStep(Node nodeFrom, Content c, Node nodeTo) {
1798+
exists(MatchClassPattern subject, MatchKeywordPattern keyword, Name attr, Pattern value |
1799+
keyword = subject.getKeyword(_) and
1800+
attr = keyword.getAttribute() and
1801+
value = keyword.getValue()
1802+
|
1803+
nodeFrom.asCfgNode().getNode() = subject and
1804+
nodeTo.asCfgNode().getNode() = value and
1805+
c.(AttributeContent).getAttribute() = attr.getId()
1806+
)
1807+
}
1808+
1809+
/** All flow steps associated with match. */
1810+
predicate matchFlowStep(Node nodeFrom, Node nodeTo) {
1811+
matchSubjectFlowStep(nodeFrom, nodeTo)
1812+
or
1813+
matchAsFlowStep(nodeFrom, nodeTo)
1814+
or
1815+
matchOrFlowStep(nodeFrom, nodeTo)
1816+
or
1817+
matchCaptureFlowStep(nodeFrom, nodeTo)
1818+
or
1819+
matchMappingFlowStep(nodeFrom, nodeTo)
1820+
}
1821+
1822+
/** All read steps associated with match. */
1823+
predicate matchReadStep(Node nodeFrom, Content c, Node nodeTo) {
1824+
matchClassReadStep(nodeFrom, c, nodeTo)
1825+
or
1826+
matchSequenceReadStep(nodeFrom, c, nodeTo)
1827+
or
1828+
matchMappingReadStep(nodeFrom, c, nodeTo)
1829+
or
1830+
matchStarReadStep(nodeFrom, c, nodeTo)
1831+
}
1832+
1833+
/** All store steps associated with match. */
1834+
predicate matchStoreStep(Node nodeFrom, Content c, Node nodeTo) {
1835+
matchStarStoreStep(nodeFrom, c, nodeTo)
1836+
}
1837+
1838+
/**
1839+
* All clear steps associated with match
1840+
*/
1841+
predicate matchClearStep(Node n, Content c) { matchMappingClearStep(n, c) }
1842+
}
1843+
1844+
import MatchUnpacking
1845+
15561846
/** Data flows from a sequence to a call to `pop` on the sequence. */
15571847
predicate popReadStep(CfgNode nodeFrom, Content c, CfgNode nodeTo) {
15581848
// set.pop or list.pop
@@ -1635,18 +1925,28 @@ predicate kwUnpackReadStep(CfgNode nodeFrom, DictionaryElementContent c, Node no
16351925
}
16361926

16371927
/**
1638-
* Holds if values stored inside content `c` are cleared at node `n`. For example,
1639-
* any value stored inside `f` is cleared at the pre-update node associated with `x`
1640-
* in `x.f = newValue`.
1928+
* Clear content at key `name` of the synthesized dictionary `TKwOverflowNode(call, callable)`,
1929+
* whenever `call` unpacks `name`.
16411930
*/
1642-
predicate clearsContent(Node n, Content c) {
1931+
predicate kwOverflowClearStep(Node n, Content c) {
16431932
exists(CallNode call, CallableValue callable, string name |
16441933
call_unpacks(call, _, callable, name, _) and
16451934
n = TKwOverflowNode(call, callable) and
16461935
c.(DictionaryElementContent).getKey() = name
16471936
)
16481937
}
16491938

1939+
/**
1940+
* Holds if values stored inside content `c` are cleared at node `n`. For example,
1941+
* any value stored inside `f` is cleared at the pre-update node associated with `x`
1942+
* in `x.f = newValue`.
1943+
*/
1944+
predicate clearsContent(Node n, Content c) {
1945+
kwOverflowClearStep(n, c)
1946+
or
1947+
matchClearStep(n, c)
1948+
}
1949+
16501950
//--------
16511951
// Fancy context-sensitive guards
16521952
//--------

python/ql/lib/semmle/python/dataflow/new/internal/DataFlowPublic.qll

Lines changed: 25 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -25,7 +25,11 @@ newtype TNode =
2525
/** A node corresponding to an SSA variable. */
2626
TEssaNode(EssaVariable var) or
2727
/** A node corresponding to a control flow node. */
28-
TCfgNode(ControlFlowNode node) { isExpressionNode(node) } or
28+
TCfgNode(ControlFlowNode node) {
29+
isExpressionNode(node)
30+
or
31+
node.getNode() instanceof Pattern
32+
} or
2933
/** A synthetic node representing the value of an object before a state change */
3034
TSyntheticPreUpdateNode(NeedsSyntheticPreUpdateNode post) or
3135
/** A synthetic node representing the value of an object after a state change. */
@@ -79,7 +83,11 @@ newtype TNode =
7983
* A synthetic node representing that there may be an iterable element
8084
* for `consumer` to consume.
8185
*/
82-
TIterableElementNode(UnpackingAssignmentTarget consumer)
86+
TIterableElementNode(UnpackingAssignmentTarget consumer) or
87+
/**
88+
* A synthetic node representing element content in a star pattern.
89+
*/
90+
TStarPatternElementNode(MatchStarPattern target)
8391

8492
/** Helper for `Node::getEnclosingCallable`. */
8593
private DataFlowCallable getCallableScope(Scope s) {
@@ -476,6 +484,21 @@ class IterableElementNode extends Node, TIterableElementNode {
476484
override Location getLocation() { result = consumer.getLocation() }
477485
}
478486

487+
/**
488+
* A synthetic node representing element content of a star pattern.
489+
*/
490+
class StarPatternElementNode extends Node, TStarPatternElementNode {
491+
CfgNode consumer;
492+
493+
StarPatternElementNode() { this = TStarPatternElementNode(consumer.getNode().getNode()) }
494+
495+
override string toString() { result = "StarPatternElement" }
496+
497+
override DataFlowCallable getEnclosingCallable() { result = consumer.getEnclosingCallable() }
498+
499+
override Location getLocation() { result = consumer.getLocation() }
500+
}
501+
479502
/**
480503
* A node that controls whether other nodes are evaluated.
481504
*/

python/ql/test/experimental/dataflow/match/dataflowTest.expected

Whitespace-only changes.

0 commit comments

Comments
 (0)