Skip to content

Commit a73e382

Browse files
authored
Python: Prevent bad join in hashlib model
I'm not entirely sure what triggered this bad join order, but some combination of the use of abstract classes and the exclusion of `new` caused this to go really wrong: ``` WeakSensitiveDataHashing.ql-15:Stdlib::Stdlib::HashlibDataPassedToHashClass#class#ffff ......... 15.5s ``` with the following tuple counts: ``` [2021-07-12 13:20:15] (16s) Tuple counts for Stdlib::Stdlib::HashlibDataPassedToHashClass#class#ffff/4@217901: 148810 ~3% {3} r1 = JOIN DataFlowPublic::CallCfgNode#class#ff#shared WITH project#DataFlowPublic::CallCfgNode::getArg_dispred#fff ON FIRST 1 OUTPUT "hashlib", Lhs.1 'node', Lhs.0 'this' 148810 ~4% {3} r2 = JOIN r1 WITH ApiGraphs::API::Impl::MkModuleImport#ff@staged_ext ON FIRST 1 OUTPUT Rhs.1, Lhs.1 'node', Lhs.2 'this' 7589310 ~486% {4} r3 = JOIN r2 WITH ApiGraphs::API::Impl::edge#2#fff@staged_ext ON FIRST 1 OUTPUT Lhs.1 'node', Lhs.2 'this', Rhs.1, InverseAppend("getMember(\"","\")",Rhs.1) 6994070 ~490% {4} r4 = SELECT r3 ON In.3 != "new" 6994070 ~4503% {2} r5 = SCAN r4 OUTPUT In.1 'this', In.0 'node' 22 ~4% {3} r6 = JOIN DataFlowPublic::CallCfgNode#class#ff#shared WITH project#DataFlowPublic::CallCfgNode::getArgByName_dispred#fff ON FIRST 1 OUTPUT "hashlib", Lhs.1 'node', Lhs.0 'this' 22 ~0% {3} r7 = JOIN r6 WITH ApiGraphs::API::Impl::MkModuleImport#ff@staged_ext ON FIRST 1 OUTPUT Rhs.1, Lhs.1 'node', Lhs.2 'this' 1122 ~437% {4} r8 = JOIN r7 WITH ApiGraphs::API::Impl::edge#2#fff@staged_ext ON FIRST 1 OUTPUT Lhs.1 'node', Lhs.2 'this', Rhs.1, InverseAppend("getMember(\"","\")",Rhs.1) 1034 ~460% {4} r9 = SELECT r8 ON In.3 != "new" 1034 ~4549% {2} r10 = SCAN r9 OUTPUT In.1 'this', In.0 'node' 6995104 ~4503% {2} r11 = r5 UNION r10 5213851 ~4683% {3} r12 = JOIN r11 WITH ApiGraphs::API::Node::getACall_dispred#ff_10#join_rhs ON FIRST 1 OUTPUT Rhs.1 'hashClass', Lhs.1 'node', Lhs.0 'this' 6478480 ~4646% {6} r13 = JOIN r12 WITH ApiGraphs::API::Impl::edge#2#fff_201#join_rhs ON FIRST 1 OUTPUT "hashlib", Rhs.1, Lhs.1 'node', Lhs.2 'this', Lhs.0 'hashClass', Rhs.2 1410 ~4693% {5} r14 = JOIN r13 WITH ApiGraphs::API::Impl::MkModuleImport#ff@staged_ext ON FIRST 2 OUTPUT Lhs.2 'node', Lhs.3 'this', Lhs.4 'hashClass', Lhs.5, InverseAppend("getMember(\"","\")",Lhs.5) 1222 ~4540% {5} r15 = SELECT r14 ON In.4 'hashName' != "new" 1222 ~4540% {4} r16 = SCAN r15 OUTPUT In.1 'this', In.4 'hashName', In.2 'hashClass', In.0 'node' ``` By factoring out the insides, the biggest iteration now looks like ``` [2021-07-12 14:17:36] (0s) Tuple counts for Stdlib::Stdlib::HashlibDataPassedToHashClass#class#ffff/4@85bb21: 148810 ~0% {2} r1 = JOIN DataFlowPublic::CallCfgNode#class#ff#shared WITH project#DataFlowPublic::CallCfgNode::getArg_dispred#fff ON FIRST 1 OUTPUT Lhs.1 'node', Lhs.0 'this' 148810 ~0% {2} r2 = JOIN r1 WITH Stdlib::Stdlib::hashlibMember#ff#nonempty CARTESIAN PRODUCT OUTPUT Lhs.1 'this', Lhs.0 'node' 22 ~0% {2} r3 = JOIN DataFlowPublic::CallCfgNode#class#ff#shared WITH project#DataFlowPublic::CallCfgNode::getArgByName_dispred#fff ON FIRST 1 OUTPUT Lhs.1 'node', Lhs.0 'this' 22 ~0% {2} r4 = JOIN r3 WITH Stdlib::Stdlib::hashlibMember#ff#nonempty CARTESIAN PRODUCT OUTPUT Lhs.1 'this', Lhs.0 'node' 148832 ~0% {2} r5 = r2 UNION r4 110933 ~2% {3} r6 = JOIN r5 WITH ApiGraphs::API::Node::getACall_dispred#ff_10#join_rhs ON FIRST 1 OUTPUT Rhs.1 'hashClass', Lhs.1 'node', Lhs.0 'this' 26 ~0% {4} r7 = JOIN r6 WITH Stdlib::Stdlib::hashlibMember#ff_10#join_rhs ON FIRST 1 OUTPUT Lhs.2 'this', Rhs.1 'hashName', Lhs.0 'hashClass', Lhs.1 'node' return r7 ``` (The tuple counts themselves are not directly comparable.)
1 parent c47d680 commit a73e382

File tree

1 file changed

+8
-4
lines changed

1 file changed

+8
-4
lines changed

python/ql/src/semmle/python/frameworks/Stdlib.qll

Lines changed: 8 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1207,6 +1207,13 @@ private module Stdlib {
12071207
override DataFlow::Node getAnInput() { result = this.getArg(0) }
12081208
}
12091209

1210+
/** Helper predicate for the `HashLibGenericHashOperation` charpred, to prevent a bad join order. */
1211+
pragma[nomagic]
1212+
private API::Node hashlibMember(string hashName) {
1213+
result = API::moduleImport("hashlib").getMember(hashName) and
1214+
hashName != "new"
1215+
}
1216+
12101217
/**
12111218
* A hashing operation from the `hashlib` package using one of the predefined classes
12121219
* (such as `hashlib.md5`). `hashlib.new` is not included, since it is handled by
@@ -1218,10 +1225,7 @@ private module Stdlib {
12181225
API::Node hashClass;
12191226

12201227
bindingset[this]
1221-
HashlibGenericHashOperation() {
1222-
not hashName = "new" and
1223-
hashClass = API::moduleImport("hashlib").getMember(hashName)
1224-
}
1228+
HashlibGenericHashOperation() { hashClass = hashlibMember(hashName) }
12251229

12261230
override Cryptography::CryptographicAlgorithm getAlgorithm() { result.matchesName(hashName) }
12271231
}

0 commit comments

Comments
 (0)