Skip to content

Commit 900cbc9

Browse files
authored
Merge pull request github#6265 from tausbn/python-performance-fixes
Python: Fix a few performance issues.
2 parents 5b7c2d1 + 30d6104 commit 900cbc9

File tree

2 files changed

+77
-13
lines changed

2 files changed

+77
-13
lines changed

python/ql/src/semmle/python/dataflow/new/SensitiveDataSources.qll

Lines changed: 69 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -60,7 +60,7 @@ private module SensitiveDataModeling {
6060
) {
6161
t.start() and
6262
exists(Function f |
63-
nameIndicatesSensitiveData(f.getName(), classification) and
63+
f.getName() = sensitiveString(classification) and
6464
result.asExpr() = f.getDefinition()
6565
)
6666
or
@@ -83,7 +83,7 @@ private module SensitiveDataModeling {
8383
// Note: If this is implemented with type-tracking, we will get cross-talk as
8484
// illustrated in python/ql/test/experimental/dataflow/sensitive-data/test.py
8585
exists(DataFlow::LocalSourceNode source |
86-
nameIndicatesSensitiveData(source.asExpr().(StrConst).getText(), classification) and
86+
source.asExpr().(StrConst).getText() = sensitiveString(classification) and
8787
source.flowsTo(result)
8888
)
8989
}
@@ -97,7 +97,7 @@ private module SensitiveDataModeling {
9797
or
9898
// to cover functions that we don't have the definition for, and where the
9999
// reference to the function has not already been marked as being sensitive
100-
nameIndicatesSensitiveData(this.getFunction().asCfgNode().(NameNode).getId(), classification)
100+
this.getFunction().asCfgNode().(NameNode).getId() = sensitiveString(classification)
101101
}
102102

103103
override SensitiveDataClassification getClassification() { result = classification }
@@ -164,6 +164,68 @@ private module SensitiveDataModeling {
164164
nodeFrom = possibleSensitiveCallable()
165165
}
166166

167+
pragma[nomagic]
168+
private string sensitiveStrConstCandidate() {
169+
result = any(StrConst s | not s.isDocString()).getText() and
170+
not result.regexpMatch(notSensitiveRegexp())
171+
}
172+
173+
pragma[nomagic]
174+
private string sensitiveAttributeNameCandidate() {
175+
result = any(DataFlow::AttrRead a).getAttributeName() and
176+
not result.regexpMatch(notSensitiveRegexp())
177+
}
178+
179+
pragma[nomagic]
180+
private string sensitiveParameterNameCandidate() {
181+
result = any(Parameter p).getName() and
182+
not result.regexpMatch(notSensitiveRegexp())
183+
}
184+
185+
pragma[nomagic]
186+
private string sensitiveFunctionNameCandidate() {
187+
result = any(Function f).getName() and
188+
not result.regexpMatch(notSensitiveRegexp())
189+
}
190+
191+
pragma[nomagic]
192+
private string sensitiveNameCandidate() {
193+
result = any(Name n).getId() and
194+
not result.regexpMatch(notSensitiveRegexp())
195+
}
196+
197+
/**
198+
* This helper predicate serves to deduplicate the results of the preceding predicates. This
199+
* means that if, say, an attribute and a function parameter have the same name, then that name will
200+
* only be matched once, which greatly cuts down on the number of regexp matches that have to be
201+
* performed.
202+
*
203+
* Under normal circumstances, deduplication is only performed when a predicate is materialized, and
204+
* so to see the effect of this we must create a separate predicate that calculates the union of the
205+
* preceding predicates.
206+
*/
207+
pragma[nomagic]
208+
private string sensitiveStringCandidate() {
209+
result in [
210+
sensitiveNameCandidate(), sensitiveAttributeNameCandidate(),
211+
sensitiveParameterNameCandidate(), sensitiveFunctionNameCandidate(),
212+
sensitiveStrConstCandidate()
213+
]
214+
}
215+
216+
/**
217+
* Returns strings (primarily the names of various program entities) that may contain sensitive data
218+
* with the classification `classification`.
219+
*
220+
* This helper predicate ends up being very similar to `nameIndicatesSensitiveData`,
221+
* but is performance optimized to limit the number of regexp matches that have to be performed.
222+
*/
223+
pragma[nomagic]
224+
private string sensitiveString(SensitiveDataClassification classification) {
225+
result = sensitiveStringCandidate() and
226+
result.regexpMatch(maybeSensitiveRegexp(classification))
227+
}
228+
167229
/**
168230
* Any kind of variable assignment (also including with/for) where the name indicates
169231
* it contains sensitive data.
@@ -182,7 +244,7 @@ private module SensitiveDataModeling {
182244

183245
SensitiveVariableAssignment() {
184246
exists(DefinitionNode def |
185-
nameIndicatesSensitiveData(def.(NameNode).getId(), classification) and
247+
def.(NameNode).getId() = sensitiveString(classification) and
186248
(
187249
this.asCfgNode() = def.getValue()
188250
or
@@ -193,7 +255,7 @@ private module SensitiveDataModeling {
193255
)
194256
or
195257
exists(With with |
196-
nameIndicatesSensitiveData(with.getOptionalVars().(Name).getId(), classification) and
258+
with.getOptionalVars().(Name).getId() = sensitiveString(classification) and
197259
this.asExpr() = with.getContextExpr()
198260
)
199261
}
@@ -209,7 +271,7 @@ private module SensitiveDataModeling {
209271
// Things like `foo.<sensitive-name>` or `from <module> import <sensitive-name>`
210272
// I considered excluding any `from ... import something_sensitive`, but then realized that
211273
// we should flag up `form ... import password as ...` as a password
212-
nameIndicatesSensitiveData(this.(DataFlow::AttrRead).getAttributeName(), classification)
274+
this.(DataFlow::AttrRead).getAttributeName() = sensitiveString(classification)
213275
or
214276
// Things like `getattr(foo, <reference-to-string>)`
215277
this.(DataFlow::AttrRead).getAttributeNameExpr() = sensitiveLookupStringConst(classification)
@@ -246,9 +308,7 @@ private module SensitiveDataModeling {
246308
class SensitiveParameter extends SensitiveDataSource::Range, DataFlow::ParameterNode {
247309
SensitiveDataClassification classification;
248310

249-
SensitiveParameter() {
250-
nameIndicatesSensitiveData(this.getParameter().getName(), classification)
251-
}
311+
SensitiveParameter() { this.getParameter().getName() = sensitiveString(classification) }
252312

253313
override SensitiveDataClassification getClassification() { result = classification }
254314
}

python/ql/src/semmle/python/frameworks/Stdlib.qll

Lines changed: 8 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1207,6 +1207,13 @@ private module Stdlib {
12071207
override DataFlow::Node getAnInput() { result = this.getArg(0) }
12081208
}
12091209

1210+
/** Helper predicate for the `HashLibGenericHashOperation` charpred, to prevent a bad join order. */
1211+
pragma[nomagic]
1212+
private API::Node hashlibMember(string hashName) {
1213+
result = API::moduleImport("hashlib").getMember(hashName) and
1214+
hashName != "new"
1215+
}
1216+
12101217
/**
12111218
* A hashing operation from the `hashlib` package using one of the predefined classes
12121219
* (such as `hashlib.md5`). `hashlib.new` is not included, since it is handled by
@@ -1218,10 +1225,7 @@ private module Stdlib {
12181225
API::Node hashClass;
12191226

12201227
bindingset[this]
1221-
HashlibGenericHashOperation() {
1222-
not hashName = "new" and
1223-
hashClass = API::moduleImport("hashlib").getMember(hashName)
1224-
}
1228+
HashlibGenericHashOperation() { hashClass = hashlibMember(hashName) }
12251229

12261230
override Cryptography::CryptographicAlgorithm getAlgorithm() { result.matchesName(hashName) }
12271231
}

0 commit comments

Comments
 (0)