Python: Fix bad join order for sensitive data

tausbn · web-flow · commit 1decf2378504 · 2021-07-12T16:10:49.000Z
Not the prettiest of solutions, but it does the job. Basically, we were
calculating (and re-calculating) the same big relation between strings
and regexes and then checking whether the latter matched the former.

This resulted in tuple counts like the following:

```
[2021-07-12 16:09:24] (12s) Tuple counts for SensitiveDataSources::SensitiveDataModeling::SensitiveVariableAssignment#class#ff#shared/4@7489c6:
4918074 ~0%     {4} r1 = JOIN SensitiveDataHeuristics::HeuristicNames::maybeSensitiveRegexp#ff WITH Flow::NameNode::getId_dispred#ff CARTESIAN PRODUCT OUTPUT Lhs.0 'arg0', Lhs.1 'arg1', Rhs.0, Rhs.1 'arg3'
2654    ~0%     {4} r2 = JOIN r1 WITH PRIMITIVE regexpMatch#bb ON Lhs.3 'arg3',Lhs.1 'arg1'
                return r2
```
(The above being just the bit that handles `DefinitionNode` in
`SensitiveVariableAssignment`, and taking 12 seconds to evaluate.)

By applying a bit of manual inlining and magic, this becomes somewhat
more manageable:

```
[2021-07-12 15:59:44] (1s) Tuple counts for SensitiveDataSources::SensitiveDataModeling::sensitiveString#ff/2@8830e2:
27671  ~2%      {3} r1 = JOIN SensitiveDataHeuristics::HeuristicNames::maybeSensitiveRegexp#ff WITH SensitiveDataSources::SensitiveDataModeling::sensitiveParameterName#f CARTESIAN PRODUCT OUTPUT Lhs.0 'classification', Lhs.1, Rhs.0

334012 ~2%      {3} r2 = JOIN SensitiveDataHeuristics::HeuristicNames::maybeSensitiveRegexp#ff WITH SensitiveDataSources::SensitiveDataModeling::sensitiveName#f CARTESIAN PRODUCT OUTPUT Lhs.0 'classification', Lhs.1, Rhs.0

361683 ~11%     {3} r3 = r1 UNION r2

154644 ~0%      {3} r4 = JOIN SensitiveDataHeuristics::HeuristicNames::maybeSensitiveRegexp#ff WITH SensitiveDataSources::SensitiveDataModeling::sensitiveFunctionName#f CARTESIAN PRODUCT OUTPUT Lhs.0 'classification', Lhs.1, Rhs.0

149198 ~1%      {3} r5 = JOIN SensitiveDataHeuristics::HeuristicNames::maybeSensitiveRegexp#ff WITH SensitiveDataSources::SensitiveDataModeling::sensitiveStrConst#f CARTESIAN PRODUCT OUTPUT Lhs.0 'classification', Lhs.1, Rhs.0

124257 ~5%      {3} r6 = JOIN SensitiveDataHeuristics::HeuristicNames::maybeSensitiveRegexp#ff WITH SensitiveDataSources::SensitiveDataModeling::sensitiveAttributeName#f CARTESIAN PRODUCT OUTPUT Lhs.0 'classification', Lhs.1, Rhs.0

273455 ~21%     {3} r7 = r5 UNION r6
428099 ~30%     {3} r8 = r4 UNION r7
789782 ~78%     {3} r9 = r3 UNION r8
1121   ~77%     {3} r10 = JOIN r9 WITH PRIMITIVE regexpMatch#bb ON Lhs.2 'result',Lhs.1
1121   ~70%     {2} r11 = SCAN r10 OUTPUT In.0 'classification', In.2 'result'
                return r11
```
(The above being the total for all the sensitive names we care about,
taking only 1.2 seconds to evaluate.)

Incidentally, you may wonder why this has _fewer_ results than before
(1121 tuples in total, versus 2654 for the `DefinitionNode` case alone).
The answer is control flow splitting -- previously, every
sensitively-named `DefinitionNode` was matched in isolation, so the same
name was matched once for every node carrying it. By pre-matching on
just the names of these, we can subsequently join against those names
that are known to be sensitive, which is a much faster operation.

(We also get the benefit of deduplicating the strings that are matched,
before actually performing the match, so if, say, an attribute name and
a variable name are identical, then we'll only match them once.)

We also exclude all docstrings from the set of relevant string
constants, as these presumably don't actually flow anywhere.
diff --git a/python/ql/src/semmle/python/dataflow/new/SensitiveDataSources.qll b/python/ql/src/semmle/python/dataflow/new/SensitiveDataSources.qll
@@ -60,7 +60,7 @@ private module SensitiveDataModeling {
   ) {
     t.start() and
     exists(Function f |
-      nameIndicatesSensitiveData(f.getName(), classification) and
+      f.getName() = sensitiveString(classification) and
       result.asExpr() = f.getDefinition()
     )
     or
@@ -83,7 +83,7 @@ private module SensitiveDataModeling {
     // Note: If this is implemented with type-tracking, we will get cross-talk as
     // illustrated in python/ql/test/experimental/dataflow/sensitive-data/test.py
     exists(DataFlow::LocalSourceNode source |
-      nameIndicatesSensitiveData(source.asExpr().(StrConst).getText(), classification) and
+      source.asExpr().(StrConst).getText() = sensitiveString(classification) and
       source.flowsTo(result)
     )
   }
@@ -97,7 +97,7 @@ private module SensitiveDataModeling {
       or
       // to cover functions that we don't have the definition for, and where the
       // reference to the function has not already been marked as being sensitive
-      nameIndicatesSensitiveData(this.getFunction().asCfgNode().(NameNode).getId(), classification)
+      this.getFunction().asCfgNode().(NameNode).getId() = sensitiveString(classification)
     }
 
     override SensitiveDataClassification getClassification() { result = classification }
@@ -164,6 +164,46 @@ private module SensitiveDataModeling {
     nodeFrom = possibleSensitiveCallable()
   }
 
+  pragma[nomagic]
+  private string sensitiveStrConstCandidate() {
+    result = any(StrConst s | not s.isDocString()).getText() and
+    not result.regexpMatch(notSensitiveRegexp())
+  }
+
+  pragma[nomagic]
+  private string sensitiveAttributeNameCandidate() {
+    result = any(DataFlow::AttrRead a).getAttributeName() and
+    not result.regexpMatch(notSensitiveRegexp())
+  }
+
+  pragma[nomagic]
+  private string sensitiveParameterNameCandidate() {
+    result = any(Parameter p).getName() and
+    not result.regexpMatch(notSensitiveRegexp())
+  }
+
+  pragma[nomagic]
+  private string sensitiveFunctionNameCandidate() {
+    result = any(Function f).getName() and
+    not result.regexpMatch(notSensitiveRegexp())
+  }
+
+  pragma[nomagic]
+  private string sensitiveNameCandidate() {
+    result = any(Name n).getId() and
+    not result.regexpMatch(notSensitiveRegexp())
+  }
+
+  pragma[nomagic]
+  private string sensitiveString(SensitiveDataClassification classification) {
+    result in [
+        sensitiveNameCandidate(), sensitiveAttributeNameCandidate(),
+        sensitiveParameterNameCandidate(), sensitiveFunctionNameCandidate(),
+        sensitiveStrConstCandidate()
+      ] and
+    result.regexpMatch(maybeSensitiveRegexp(classification))
+  }
+
   /**
    * Any kind of variable assignment (also including with/for) where the name indicates
    * it contains sensitive data.
@@ -182,7 +222,7 @@ private module SensitiveDataModeling {
 
     SensitiveVariableAssignment() {
       exists(DefinitionNode def |
-        nameIndicatesSensitiveData(def.(NameNode).getId(), classification) and
+        def.(NameNode).getId() = sensitiveString(classification) and
         (
           this.asCfgNode() = def.getValue()
           or
@@ -193,7 +233,7 @@ private module SensitiveDataModeling {
       )
       or
       exists(With with |
-        nameIndicatesSensitiveData(with.getOptionalVars().(Name).getId(), classification) and
+        with.getOptionalVars().(Name).getId() = sensitiveString(classification) and
         this.asExpr() = with.getContextExpr()
       )
     }
@@ -209,7 +249,7 @@ private module SensitiveDataModeling {
       // Things like `foo.<sensitive-name>` or `from <module> import <sensitive-name>`
       // I considered excluding any `from ... import something_sensitive`, but then realized that
       // we should flag up `form ... import password as ...` as a password
-      nameIndicatesSensitiveData(this.(DataFlow::AttrRead).getAttributeName(), classification)
+      this.(DataFlow::AttrRead).getAttributeName() = sensitiveString(classification)
       or
       // Things like `getattr(foo, <reference-to-string>)`
       this.(DataFlow::AttrRead).getAttributeNameExpr() = sensitiveLookupStringConst(classification)
@@ -246,9 +286,7 @@ private module SensitiveDataModeling {
   class SensitiveParameter extends SensitiveDataSource::Range, DataFlow::ParameterNode {
     SensitiveDataClassification classification;
 
-    SensitiveParameter() {
-      nameIndicatesSensitiveData(this.getParameter().getName(), classification)
-    }
+    SensitiveParameter() { this.getParameter().getName() = sensitiveString(classification) }
 
     override SensitiveDataClassification getClassification() { result = classification }
   }