Skip to content

Commit b19d64f

Browse files
authored
Merge pull request github#6013 from RasmusWL/sensitive-improvements
Python: Improve sensitive data modeling
2 parents 19305a2 + d19bc12 commit b19d64f

File tree

7 files changed

+363
-29
lines changed

7 files changed

+363
-29
lines changed
Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,2 @@
1+
lgtm,codescanning
2+
* Expanded modeling of sensitive data sources to include: subscripting with a key that indicates sensitive data (`obj["password"]`), parameters whose names indicate sensitive data (`def func(password):`), and assignments to variables whose names indicate sensitive data (`password = ...`).

python/ql/src/semmle/python/dataflow/new/SensitiveDataSources.qll

Lines changed: 220 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -5,10 +5,14 @@
55

66
private import python
77
private import semmle.python.dataflow.new.DataFlow
8-
// Need to import since frameworks can extend `RemoteFlowSource::Range`
8+
// Need to import `semmle.python.Frameworks` since frameworks can extend `SensitiveDataSource::Range`
99
private import semmle.python.Frameworks
10-
private import semmle.python.Concepts
11-
private import semmle.python.security.SensitiveData as OldSensitiveData
10+
private import semmle.python.security.internal.SensitiveDataHeuristics as SensitiveDataHeuristics
11+
12+
// We export these explicitly, so we don't also export the `HeuristicNames` module.
13+
class SensitiveDataClassification = SensitiveDataHeuristics::SensitiveDataClassification;
14+
15+
module SensitiveDataClassification = SensitiveDataHeuristics::SensitiveDataClassification;
1216

1317
/**
1418
* A data flow source of sensitive data, such as secrets, certificates, or passwords.
@@ -22,13 +26,9 @@ class SensitiveDataSource extends DataFlow::Node {
2226
SensitiveDataSource() { this = range }
2327

2428
/**
25-
* INTERNAL: Do not use.
26-
*
27-
* This will be rewritten to have better types soon, and therefore should only be used internally until then.
28-
*
2929
* Gets the classification of the sensitive data.
3030
*/
31-
string getClassification() { result = range.getClassification() }
31+
SensitiveDataClassification getClassification() { result = range.getClassification() }
3232
}
3333

3434
/** Provides a class for modeling new sources of sensitive data, such as secrets, certificates, or passwords. */
@@ -41,26 +41,225 @@ module SensitiveDataSource {
4141
*/
4242
abstract class Range extends DataFlow::Node {
4343
/**
44-
* INTERNAL: Do not use.
45-
*
46-
* This will be rewritten to have better types soon, and therefore should only be used internally until then.
47-
*
4844
* Gets the classification of the sensitive data.
4945
*/
50-
abstract string getClassification();
46+
abstract SensitiveDataClassification getClassification();
5147
}
5248
}
5349

54-
private class PortOfOldModeling extends SensitiveDataSource::Range {
55-
OldSensitiveData::SensitiveData::Source oldSensitiveSource;
50+
/** Actual sensitive data modeling */
51+
private module SensitiveDataModeling {
52+
private import SensitiveDataHeuristics::HeuristicNames
5653

57-
PortOfOldModeling() { this.asCfgNode() = oldSensitiveSource }
54+
/**
55+
* Gets a reference to a function that is considered to be a sensitive source of
56+
* `classification`.
57+
*/
58+
private DataFlow::LocalSourceNode sensitiveFunction(
59+
DataFlow::TypeTracker t, SensitiveDataClassification classification
60+
) {
61+
t.start() and
62+
exists(Function f |
63+
nameIndicatesSensitiveData(f.getName(), classification) and
64+
result.asExpr() = f.getDefinition()
65+
)
66+
or
67+
exists(DataFlow::TypeTracker t2 | result = sensitiveFunction(t2, classification).track(t2, t))
68+
}
5869

59-
override string getClassification() {
60-
exists(OldSensitiveData::SensitiveData classification |
61-
oldSensitiveSource.isSourceOf(classification)
62-
|
63-
classification = "sensitive.data." + result
70+
/**
71+
* Gets a reference to a function that is considered to be a sensitive source of
72+
* `classification`.
73+
*/
74+
DataFlow::Node sensitiveFunction(SensitiveDataClassification classification) {
75+
sensitiveFunction(DataFlow::TypeTracker::end(), classification).flowsTo(result)
76+
}
77+
78+
/**
79+
* Gets a reference to a string constant that, if used as the key in a lookup,
80+
* indicates the presence of sensitive data with `classification`.
81+
*/
82+
private DataFlow::LocalSourceNode sensitiveLookupStringConst(
83+
DataFlow::TypeTracker t, SensitiveDataClassification classification
84+
) {
85+
t.start() and
86+
nameIndicatesSensitiveData(result.asExpr().(StrConst).getText(), classification)
87+
or
88+
exists(DataFlow::TypeTracker t2 |
89+
result = sensitiveLookupStringConst(t2, classification).track(t2, t)
6490
)
6591
}
92+
93+
/**
94+
* Gets a reference to a string constant that, if used as the key in a lookup,
95+
* indicates the presence of sensitive data with `classification`.
96+
*
97+
* Also see `extraStepForCalls`.
98+
*/
99+
DataFlow::Node sensitiveLookupStringConst(SensitiveDataClassification classification) {
100+
sensitiveLookupStringConst(DataFlow::TypeTracker::end(), classification).flowsTo(result)
101+
}
102+
103+
/** A function call that is considered a source of sensitive data. */
104+
class SensitiveFunctionCall extends SensitiveDataSource::Range, DataFlow::CallCfgNode {
105+
SensitiveDataClassification classification;
106+
107+
SensitiveFunctionCall() {
108+
this.getFunction() = sensitiveFunction(classification)
109+
or
110+
// to cover functions that we don't have the definition for, and where the
111+
// reference to the function has not already been marked as being sensitive
112+
nameIndicatesSensitiveData(this.getFunction().asCfgNode().(NameNode).getId(), classification)
113+
}
114+
115+
override SensitiveDataClassification getClassification() { result = classification }
116+
}
117+
118+
/**
119+
* Tracks any modeled source of sensitive data (with any classification),
120+
* to limit the scope of `extraStepForCalls`. See it's QLDoc for more context.
121+
*/
122+
private DataFlow::LocalSourceNode possibleSensitiveCallable(DataFlow::TypeTracker t) {
123+
t.start() and
124+
result instanceof SensitiveDataSource
125+
or
126+
exists(DataFlow::TypeTracker t2 | result = possibleSensitiveCallable(t2).track(t2, t))
127+
}
128+
129+
/**
130+
* Tracks any modeled source of sensitive data (with any classification),
131+
* to limit the scope of `extraStepForCalls`. See it's QLDoc for more context.
132+
*/
133+
private DataFlow::Node possibleSensitiveCallable() {
134+
possibleSensitiveCallable(DataFlow::TypeTracker::end()).flowsTo(result)
135+
}
136+
137+
/**
138+
* Holds if the step from `nodeFrom` to `nodeTo` should be considered a
139+
* taint-flow step for sensitive-data, to ensure calls are handled correctly.
140+
*
141+
* To handle calls properly, while preserving a good source for path explanations,
142+
* you need to include this predicate as an additional taint step in your taint-tracking
143+
* configurations.
144+
*
145+
* The core problem can be illustrated by the example below. If we consider the
146+
* `print` a sink, what path and what source do we want to show? My initial approach
147+
* would be to use type-tracking to propagate from the `not_found.get_passwd` attribute
148+
* lookup, to the use of `non_sensitive_name`, and then create a new `SensitiveDataSource::Range`
149+
* like `SensitiveFunctionCall`. Although that seems likely to work, it will also end up
150+
* with a non-optimal path, which starts at _bad source_, and therefore doesn't show
151+
* how we figured out that `non_sensitive_name`
152+
* could be a function that returns a password (and in cases where there is many calls to
153+
* `my_func` it will be annoying for someone to figure this out manually).
154+
*
155+
* By including this additional taint-step in the taint-tracking configuration, it's possible
156+
* to get a path explanation going from _good source_ to the sink.
157+
*
158+
* ```python
159+
* def my_func(non_sensitive_name):
160+
* x = non_sensitive_name() # <-- bad source
161+
* print(x) # <-- sink
162+
*
163+
* import not_found
164+
* f = not_found.get_passwd # <-- good source
165+
* my_func(f)
166+
* ```
167+
*/
168+
predicate extraStepForCalls(DataFlow::Node nodeFrom, DataFlow::CallCfgNode nodeTo) {
169+
// However, we do still use the type-tracking approach to limit the size of this
170+
// predicate.
171+
nodeTo.getFunction() = nodeFrom and
172+
nodeFrom = possibleSensitiveCallable()
173+
}
174+
175+
/**
176+
* Any kind of variable assignment (also including with/for) where the name indicates
177+
* it contains sensitive data.
178+
*
179+
* Note: We _could_ make any access to a variable with a sensitive name a source of
180+
* sensitive data, but to make path explanations in data-flow/taint-tracking good,
181+
* we don't want that, since it works against allowing users to understand the flow
182+
* in the program (which is the whole point).
183+
*
184+
* Note: To make data-flow/taint-tracking work, the expression that is _assigned_ to
185+
* the variable is marked as the source (as compared to marking the variable as the
186+
* source).
187+
*/
188+
class SensitiveVariableAssignment extends SensitiveDataSource::Range {
189+
SensitiveDataClassification classification;
190+
191+
SensitiveVariableAssignment() {
192+
exists(DefinitionNode def |
193+
nameIndicatesSensitiveData(def.(NameNode).getId(), classification) and
194+
(
195+
this.asCfgNode() = def.getValue()
196+
or
197+
this.asCfgNode() = def.getValue().(ForNode).getSequence()
198+
) and
199+
not this.asExpr() instanceof FunctionExpr and
200+
not this.asExpr() instanceof ClassExpr
201+
)
202+
or
203+
exists(With with |
204+
nameIndicatesSensitiveData(with.getOptionalVars().(Name).getId(), classification) and
205+
this.asExpr() = with.getContextExpr()
206+
)
207+
}
208+
209+
override SensitiveDataClassification getClassification() { result = classification }
210+
}
211+
212+
/** An attribute access that is considered a source of sensitive data. */
213+
class SensitiveAttributeAccess extends SensitiveDataSource::Range {
214+
SensitiveDataClassification classification;
215+
216+
SensitiveAttributeAccess() {
217+
// Things like `foo.<sensitive-name>` or `from <module> import <sensitive-name>`
218+
// I considered excluding any `from ... import something_sensitive`, but then realized that
219+
// we should flag up `form ... import password as ...` as a password
220+
nameIndicatesSensitiveData(this.(DataFlow::AttrRead).getAttributeName(), classification)
221+
or
222+
// Things like `getattr(foo, <reference-to-string>)`
223+
this.(DataFlow::AttrRead).getAttributeNameExpr() = sensitiveLookupStringConst(classification)
224+
}
225+
226+
override SensitiveDataClassification getClassification() { result = classification }
227+
}
228+
229+
/** A subscript, where the key indicates the result will be sensitive data. */
230+
class SensitiveSubscript extends SensitiveDataSource::Range {
231+
SensitiveDataClassification classification;
232+
233+
SensitiveSubscript() {
234+
this.asCfgNode().(SubscriptNode).getIndex() =
235+
sensitiveLookupStringConst(classification).asCfgNode()
236+
}
237+
238+
override SensitiveDataClassification getClassification() { result = classification }
239+
}
240+
241+
/** A call to `get` on an object, where the key indicates the result will be sensitive data. */
242+
class SensitiveGetCall extends SensitiveDataSource::Range, DataFlow::CallCfgNode {
243+
SensitiveDataClassification classification;
244+
245+
SensitiveGetCall() {
246+
this.getFunction().asCfgNode().(AttrNode).getName() = "get" and
247+
this.getArg(0) = sensitiveLookupStringConst(classification)
248+
}
249+
250+
override SensitiveDataClassification getClassification() { result = classification }
251+
}
252+
253+
/** A parameter where the name indicates it will receive sensitive data. */
254+
class SensitiveParameter extends SensitiveDataSource::Range, DataFlow::ParameterNode {
255+
SensitiveDataClassification classification;
256+
257+
SensitiveParameter() {
258+
nameIndicatesSensitiveData(this.getParameter().getName(), classification)
259+
}
260+
261+
override SensitiveDataClassification getClassification() { result = classification }
262+
}
66263
}
264+
265+
predicate sensitiveDataExtraStepForCalls = SensitiveDataModeling::extraStepForCalls/2;

python/ql/src/semmle/python/security/dataflow/WeakSensitiveDataHashing.qll

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,7 @@ private import semmle.python.dataflow.new.TaintTracking
1313
private import semmle.python.Concepts
1414
private import semmle.python.dataflow.new.RemoteFlowSources
1515
private import semmle.python.dataflow.new.BarrierGuards
16+
private import semmle.python.dataflow.new.SensitiveDataSources
1617

1718
/**
1819
* Provides a taint-tracking configuration for detecting use of a broken or weak
@@ -38,6 +39,10 @@ module NormalHashFunction {
3839
or
3940
node instanceof Sanitizer
4041
}
42+
43+
override predicate isAdditionalTaintStep(DataFlow::Node node1, DataFlow::Node node2) {
44+
sensitiveDataExtraStepForCalls(node1, node2)
45+
}
4146
}
4247
}
4348

@@ -70,5 +75,9 @@ module ComputationallyExpensiveHashFunction {
7075
or
7176
node instanceof Sanitizer
7277
}
78+
79+
override predicate isAdditionalTaintStep(DataFlow::Node node1, DataFlow::Node node2) {
80+
sensitiveDataExtraStepForCalls(node1, node2)
81+
}
7382
}
7483
}

python/ql/src/semmle/python/security/dataflow/WeakSensitiveDataHashingCustomizations.qll

Lines changed: 7 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -52,7 +52,9 @@ module NormalHashFunction {
5252
* A source of sensitive data, considered as a flow source.
5353
*/
5454
class SensitiveDataSourceAsSource extends Source, SensitiveDataSource {
55-
override string getClassification() { result = SensitiveDataSource.super.getClassification() }
55+
override SensitiveDataClassification getClassification() {
56+
result = SensitiveDataSource.super.getClassification()
57+
}
5658
}
5759

5860
/** The input to a hashing operation using a weak algorithm, considered as a flow sink. */
@@ -120,12 +122,12 @@ module ComputationallyExpensiveHashFunction {
120122
*/
121123
class PasswordSourceAsSource extends Source, SensitiveDataSource {
122124
PasswordSourceAsSource() {
123-
// TODO: once https://github.com/github/codeql/pull/5739 has been merged,
124-
// don't use hardcoded value anymore
125-
SensitiveDataSource.super.getClassification() = "password"
125+
SensitiveDataSource.super.getClassification() = SensitiveDataClassification::password()
126126
}
127127

128-
override string getClassification() { result = SensitiveDataSource.super.getClassification() }
128+
override SensitiveDataClassification getClassification() {
129+
result = SensitiveDataSource.super.getClassification()
130+
}
129131
}
130132

131133
/**
Lines changed: 32 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,12 +1,17 @@
1+
// /**
2+
// * @kind path-problem
3+
// */
14
import python
25
import semmle.python.dataflow.new.DataFlow
6+
import semmle.python.dataflow.new.TaintTracking
37
import TestUtilities.InlineExpectationsTest
48
import semmle.python.dataflow.new.SensitiveDataSources
9+
private import semmle.python.ApiGraphs
510

611
class SensitiveDataSourcesTest extends InlineExpectationsTest {
712
SensitiveDataSourcesTest() { this = "SensitiveDataSourcesTest" }
813

9-
override string getARelevantTag() { result = "SensitiveDataSource" }
14+
override string getARelevantTag() { result in ["SensitiveDataSource", "SensitiveUse"] }
1015

1116
override predicate hasActualResult(Location location, string element, string tag, string value) {
1217
exists(location.getFile().getRelativePath()) and
@@ -15,6 +20,32 @@ class SensitiveDataSourcesTest extends InlineExpectationsTest {
1520
element = source.toString() and
1621
value = source.getClassification() and
1722
tag = "SensitiveDataSource"
23+
or
24+
exists(DataFlow::Node use |
25+
any(SensitiveUseConfiguration config).hasFlow(source, use) and
26+
location = use.getLocation() and
27+
element = use.toString() and
28+
value = source.getClassification() and
29+
tag = "SensitiveUse"
30+
)
1831
)
1932
}
2033
}
34+
35+
class SensitiveUseConfiguration extends TaintTracking::Configuration {
36+
SensitiveUseConfiguration() { this = "SensitiveUseConfiguration" }
37+
38+
override predicate isSource(DataFlow::Node node) { node instanceof SensitiveDataSource }
39+
40+
override predicate isSink(DataFlow::Node node) {
41+
node = API::builtin("print").getACall().getArg(_)
42+
}
43+
44+
override predicate isAdditionalTaintStep(DataFlow::Node node1, DataFlow::Node node2) {
45+
sensitiveDataExtraStepForCalls(node1, node2)
46+
}
47+
}
48+
// import DataFlow::PathGraph
49+
// from SensitiveUseConfiguration cfg, DataFlow::PathNode source, DataFlow::PathNode sink
50+
// where cfg.hasFlowPath(source, sink)
51+
// select sink, source, sink, "taint from $@", source.getNode(), "here"

0 commit comments

Comments
 (0)