Skip to content

Commit 1decf23

Browse files
authored
Python: Fix bad join order for sensitive data
Not the prettiest of solutions, but it does the job. Basically, we were calculating (and re-calculating) the same big relation between strings and regexes and then checking whether the latter matched the former. This resulted in tuple counts like the following: ``` [2021-07-12 16:09:24] (12s) Tuple counts for SensitiveDataSources::SensitiveDataModeling::SensitiveVariableAssignment#class#ff#shared/4@7489c6: 4918074 ~0% {4} r1 = JOIN SensitiveDataHeuristics::HeuristicNames::maybeSensitiveRegexp#ff WITH Flow::NameNode::getId_dispred#ff CARTESIAN PRODUCT OUTPUT Lhs.0 'arg0', Lhs.1 'arg1', Rhs.0, Rhs.1 'arg3' 2654 ~0% {4} r2 = JOIN r1 WITH PRIMITIVE regexpMatch#bb ON Lhs.3 'arg3',Lhs.1 'arg1' return r2 ``` (The above being just the bit that handles `DefinitionNode` in `SensitiveVariableAssignment`, and taking 12 seconds to evaluate.) By applying a bit of manual inlining and magic, this becomes somewhat more manageable: ``` [2021-07-12 15:59:44] (1s) Tuple counts for SensitiveDataSources::SensitiveDataModeling::sensitiveString#ff/2@8830e2: 27671 ~2% {3} r1 = JOIN SensitiveDataHeuristics::HeuristicNames::maybeSensitiveRegexp#ff WITH SensitiveDataSources::SensitiveDataModeling::sensitiveParameterName#f CARTESIAN PRODUCT OUTPUT Lhs.0 'classification', Lhs.1, Rhs.0 334012 ~2% {3} r2 = JOIN SensitiveDataHeuristics::HeuristicNames::maybeSensitiveRegexp#ff WITH SensitiveDataSources::SensitiveDataModeling::sensitiveName#f CARTESIAN PRODUCT OUTPUT Lhs.0 'classification', Lhs.1, Rhs.0 361683 ~11% {3} r3 = r1 UNION r2 154644 ~0% {3} r4 = JOIN SensitiveDataHeuristics::HeuristicNames::maybeSensitiveRegexp#ff WITH SensitiveDataSources::SensitiveDataModeling::sensitiveFunctionName#f CARTESIAN PRODUCT OUTPUT Lhs.0 'classification', Lhs.1, Rhs.0 149198 ~1% {3} r5 = JOIN SensitiveDataHeuristics::HeuristicNames::maybeSensitiveRegexp#ff WITH SensitiveDataSources::SensitiveDataModeling::sensitiveStrConst#f CARTESIAN PRODUCT OUTPUT Lhs.0 'classification', Lhs.1, Rhs.0 124257 ~5% {3} r6 = JOIN SensitiveDataHeuristics::HeuristicNames::maybeSensitiveRegexp#ff WITH SensitiveDataSources::SensitiveDataModeling::sensitiveAttributeName#f CARTESIAN PRODUCT OUTPUT Lhs.0 'classification', Lhs.1, Rhs.0 273455 ~21% {3} r7 = r5 UNION r6 428099 ~30% {3} r8 = r4 UNION r7 789782 ~78% {3} r9 = r3 UNION r8 1121 ~77% {3} r10 = JOIN r9 WITH PRIMITIVE regexpMatch#bb ON Lhs.2 'result',Lhs.1 1121 ~70% {2} r11 = SCAN r10 OUTPUT In.0 'classification', In.2 'result' return r11 ``` (The above being the total for all the sensitive names we care about, taking only 1.2 seconds to evaluate.) Incidentally, you may wonder why this has _fewer_ results than before. The answer is control flow splitting -- every sensitively-named `DefinitionNode` would have been matched in isolation previously. By pre-matching on just the names of these, we can subsequently join against those names that are known to be sensitive, which is a much faster operation. (We also get the benefit of deduplicating the strings that are matched, before actually performing the match, so if, say, an attribute name and a variable name are identical, then we'll only match them once.) We also exclude all docstrings as relevant string constants, as these presumably don't actually flow anywhere.
1 parent a73e382 commit 1decf23

File tree

1 file changed

+47
-9
lines changed

1 file changed

+47
-9
lines changed

python/ql/src/semmle/python/dataflow/new/SensitiveDataSources.qll

Lines changed: 47 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -60,7 +60,7 @@ private module SensitiveDataModeling {
6060
) {
6161
t.start() and
6262
exists(Function f |
63-
nameIndicatesSensitiveData(f.getName(), classification) and
63+
f.getName() = sensitiveString(classification) and
6464
result.asExpr() = f.getDefinition()
6565
)
6666
or
@@ -83,7 +83,7 @@ private module SensitiveDataModeling {
8383
// Note: If this is implemented with type-tracking, we will get cross-talk as
8484
// illustrated in python/ql/test/experimental/dataflow/sensitive-data/test.py
8585
exists(DataFlow::LocalSourceNode source |
86-
nameIndicatesSensitiveData(source.asExpr().(StrConst).getText(), classification) and
86+
source.asExpr().(StrConst).getText() = sensitiveString(classification) and
8787
source.flowsTo(result)
8888
)
8989
}
@@ -97,7 +97,7 @@ private module SensitiveDataModeling {
9797
or
9898
// to cover functions that we don't have the definition for, and where the
9999
// reference to the function has not already been marked as being sensitive
100-
nameIndicatesSensitiveData(this.getFunction().asCfgNode().(NameNode).getId(), classification)
100+
this.getFunction().asCfgNode().(NameNode).getId() = sensitiveString(classification)
101101
}
102102

103103
override SensitiveDataClassification getClassification() { result = classification }
@@ -164,6 +164,46 @@ private module SensitiveDataModeling {
164164
nodeFrom = possibleSensitiveCallable()
165165
}
166166

167+
pragma[nomagic]
168+
private string sensitiveStrConstCandidate() {
169+
result = any(StrConst s | not s.isDocString()).getText() and
170+
not result.regexpMatch(notSensitiveRegexp())
171+
}
172+
173+
pragma[nomagic]
174+
private string sensitiveAttributeNameCandidate() {
175+
result = any(DataFlow::AttrRead a).getAttributeName() and
176+
not result.regexpMatch(notSensitiveRegexp())
177+
}
178+
179+
pragma[nomagic]
180+
private string sensitiveParameterNameCandidate() {
181+
result = any(Parameter p).getName() and
182+
not result.regexpMatch(notSensitiveRegexp())
183+
}
184+
185+
pragma[nomagic]
186+
private string sensitiveFunctionNameCandidate() {
187+
result = any(Function f).getName() and
188+
not result.regexpMatch(notSensitiveRegexp())
189+
}
190+
191+
pragma[nomagic]
192+
private string sensitiveNameCandidate() {
193+
result = any(Name n).getId() and
194+
not result.regexpMatch(notSensitiveRegexp())
195+
}
196+
197+
pragma[nomagic]
198+
private string sensitiveString(SensitiveDataClassification classification) {
199+
result in [
200+
sensitiveNameCandidate(), sensitiveAttributeNameCandidate(),
201+
sensitiveParameterNameCandidate(), sensitiveFunctionNameCandidate(),
202+
sensitiveStrConstCandidate()
203+
] and
204+
result.regexpMatch(maybeSensitiveRegexp(classification))
205+
}
206+
167207
/**
168208
* Any kind of variable assignment (also including with/for) where the name indicates
169209
* it contains sensitive data.
@@ -182,7 +222,7 @@ private module SensitiveDataModeling {
182222

183223
SensitiveVariableAssignment() {
184224
exists(DefinitionNode def |
185-
nameIndicatesSensitiveData(def.(NameNode).getId(), classification) and
225+
def.(NameNode).getId() = sensitiveString(classification) and
186226
(
187227
this.asCfgNode() = def.getValue()
188228
or
@@ -193,7 +233,7 @@ private module SensitiveDataModeling {
193233
)
194234
or
195235
exists(With with |
196-
nameIndicatesSensitiveData(with.getOptionalVars().(Name).getId(), classification) and
236+
with.getOptionalVars().(Name).getId() = sensitiveString(classification) and
197237
this.asExpr() = with.getContextExpr()
198238
)
199239
}
@@ -209,7 +249,7 @@ private module SensitiveDataModeling {
209249
// Things like `foo.<sensitive-name>` or `from <module> import <sensitive-name>`
210250
// I considered excluding any `from ... import something_sensitive`, but then realized that
211251
// we should flag up `form ... import password as ...` as a password
212-
nameIndicatesSensitiveData(this.(DataFlow::AttrRead).getAttributeName(), classification)
252+
this.(DataFlow::AttrRead).getAttributeName() = sensitiveString(classification)
213253
or
214254
// Things like `getattr(foo, <reference-to-string>)`
215255
this.(DataFlow::AttrRead).getAttributeNameExpr() = sensitiveLookupStringConst(classification)
@@ -246,9 +286,7 @@ private module SensitiveDataModeling {
246286
class SensitiveParameter extends SensitiveDataSource::Range, DataFlow::ParameterNode {
247287
SensitiveDataClassification classification;
248288

249-
SensitiveParameter() {
250-
nameIndicatesSensitiveData(this.getParameter().getName(), classification)
251-
}
289+
SensitiveParameter() { this.getParameter().getName() = sensitiveString(classification) }
252290

253291
override SensitiveDataClassification getClassification() { result = classification }
254292
}

0 commit comments

Comments
 (0)