Python: Better handling of sensitive functions

RasmusWL · RasmusWL · commit ea0c1d7db3da · 2021-06-10T15:08:21.000+02:00
This solution was the best I could come up with, but it _is_ a bit
brittle since you need to remember to add this additional taint step
to any configuration that relies on sensitive data sources... I don't
see an easy way around this though :|
diff --git a/python/ql/src/semmle/python/dataflow/new/SensitiveDataSources.qll b/python/ql/src/semmle/python/dataflow/new/SensitiveDataSources.qll
@@ -93,6 +93,8 @@ private module SensitiveDataModeling {
   /**
    * Gets a reference to a string constant that, if used as the key in a lookup,
    * indicates the presence of sensitive data with `classification`.
+   *
+   * Also see `extraStepForCalls`.
    */
   DataFlow::Node sensitiveLookupStringConst(SensitiveDataClassification classification) {
     sensitiveLookupStringConst(DataFlow::TypeTracker::end(), classification).flowsTo(result)
@@ -105,12 +107,49 @@ private module SensitiveDataModeling {
     SensitiveFunctionCall() {
       this.getFunction() = sensitiveFunction(classification)
       or
+      // to cover functions that we don't have the definition for, and where the
+      // reference to the function has not already been marked as being sensitive
       nameIndicatesSensitiveData(this.getFunction().asCfgNode().(NameNode).getId(), classification)
     }
 
     override SensitiveDataClassification getClassification() { result = classification }
   }
 
+  /**
+   * Holds if the step from `nodeFrom` to `nodeTo` should be considered a
+   * taint-flow step for sensitive data, to ensure calls are handled correctly.
+   *
+   * To handle calls properly, while preserving a good source for path explanations,
+   * you need to include this predicate as an additional taint step in your taint-tracking
+   * configurations.
+   *
+   * The core problem can be illustrated by the example below. If we consider the
+   * `print` a sink, what path and what source do we want to show? My initial approach
+   * would be to use type-tracking to propagate from the `not_found.get_passwd` attribute
+   * lookup, to the use of `non_sensitive_name`, and then create a new `SensitiveDataSource::Range`
+   * like `SensitiveFunctionCall`. Although that seems likely to work, it will also end up
+   * with a non-optimal path, which starts at _bad source_, and therefore doesn't show
+   * how we figured out that `non_sensitive_name`
+   * could be a function that returns a password (and in cases where there are many calls to
+   * `my_func` it will be annoying for someone to figure this out manually).
+   *
+   * By including this additional taint-step in the taint-tracking configuration, it's possible
+   * to get a path explanation going from _good source_ to the sink.
+   *
+   * ```python
+   * def my_func(non_sensitive_name):
+   *     x = non_sensitive_name() # <-- bad source
+   *     print(x) # <-- sink
+   *
+   * import not_found
+   * f = not_found.get_passwd # <-- good source
+   * my_func(f)
+   * ```
+   */
+  predicate extraStepForCalls(DataFlow::Node nodeFrom, DataFlow::CallCfgNode nodeTo) {
+    nodeTo.getFunction() = nodeFrom
+  }
+
   /**
    * Any kind of variable assignment (also including with/for) where the name indicates
    * it contains sensitive data.
@@ -200,3 +239,5 @@ private module SensitiveDataModeling {
     override SensitiveDataClassification getClassification() { result = classification }
   }
 }
+
+predicate sensitiveDataExtraStepForCalls = SensitiveDataModeling::extraStepForCalls/2;
diff --git a/python/ql/src/semmle/python/security/dataflow/WeakSensitiveDataHashing.qll b/python/ql/src/semmle/python/security/dataflow/WeakSensitiveDataHashing.qll
@@ -13,6 +13,7 @@ private import semmle.python.dataflow.new.TaintTracking
 private import semmle.python.Concepts
 private import semmle.python.dataflow.new.RemoteFlowSources
 private import semmle.python.dataflow.new.BarrierGuards
+private import semmle.python.dataflow.new.SensitiveDataSources
 
 /**
  * Provides a taint-tracking configuration for detecting use of a broken or weak
@@ -38,6 +39,10 @@ module NormalHashFunction {
       or
       node instanceof Sanitizer
     }
+
+    override predicate isAdditionalTaintStep(DataFlow::Node node1, DataFlow::Node node2) {
+      sensitiveDataExtraStepForCalls(node1, node2)
+    }
   }
 }
 
@@ -70,5 +75,9 @@ module ComputationallyExpensiveHashFunction {
       or
       node instanceof Sanitizer
     }
+
+    override predicate isAdditionalTaintStep(DataFlow::Node node1, DataFlow::Node node2) {
+      sensitiveDataExtraStepForCalls(node1, node2)
+    }
   }
 }
diff --git a/python/ql/test/experimental/dataflow/sensitive-data/TestSensitiveDataSources.ql b/python/ql/test/experimental/dataflow/sensitive-data/TestSensitiveDataSources.ql
@@ -40,6 +40,10 @@ class SensitiveUseConfiguration extends TaintTracking::Configuration {
   override predicate isSink(DataFlow::Node node) {
     node = API::builtin("print").getACall().getArg(_)
   }
+
+  override predicate isAdditionalTaintStep(DataFlow::Node node1, DataFlow::Node node2) {
+    sensitiveDataExtraStepForCalls(node1, node2)
+  }
 }
 // import DataFlow::PathGraph
 // from SensitiveUseConfiguration cfg, DataFlow::PathNode source, DataFlow::PathNode sink
diff --git a/python/ql/test/experimental/dataflow/sensitive-data/test.py b/python/ql/test/experimental/dataflow/sensitive-data/test.py
@@ -29,17 +29,17 @@ def encrypt_password(pwd):
 print(x) # $ SensitiveUse=password
 
 f = get_passwd
-x = f() # $ MISSING: SensitiveDataSource=password
-print(x) # $ MISSING: SensitiveUse=password
+x = f()
+print(x) # $ SensitiveUse=password
 
 import not_found
 f = not_found.get_passwd # $ SensitiveDataSource=password
-x = f() # $ MISSING: SensitiveDataSource=password
-print(x) # $ MISSING: SensitiveUse=password
+x = f()
+print(x) # $ SensitiveUse=password
 
 def my_func(non_sensitive_name):
-    x = non_sensitive_name() # $ MISSING: SensitiveDataSource=password
-    print(x) # $ MISSING: SensitiveUse=password
+    x = non_sensitive_name()
+    print(x) # $ SensitiveUse=password
 f = not_found.get_passwd # $ SensitiveDataSource=password
 my_func(f)
 

Original file line number	Diff line number	Diff line change
`@@ -13,6 +13,7 @@ private import semmle.python.dataflow.new.TaintTracking`
`13`	`13`	`private import semmle.python.Concepts`
`14`	`14`	`private import semmle.python.dataflow.new.RemoteFlowSources`
`15`	`15`	`private import semmle.python.dataflow.new.BarrierGuards`
	`16`	`+private import semmle.python.dataflow.new.SensitiveDataSources`
`16`	`17`
`17`	`18`	`/**`
`18`	`19`	`* Provides a taint-tracking configuration for detecting use of a broken or weak`
`@@ -38,6 +39,10 @@ module NormalHashFunction {`
`38`	`39`	`or`
`39`	`40`	`node instanceof Sanitizer`
`40`	`41`	`}`
	`42`	`+`
	`43`	`+ override predicate isAdditionalTaintStep(DataFlow::Node node1, DataFlow::Node node2) {`
	`44`	`+ sensitiveDataExtraStepForCalls(node1, node2)`
	`45`	`+ }`
`41`	`46`	`}`
`42`	`47`	`}`
`43`	`48`
`@@ -70,5 +75,9 @@ module ComputationallyExpensiveHashFunction {`
`70`	`75`	`or`
`71`	`76`	`node instanceof Sanitizer`
`72`	`77`	`}`
	`78`	`+`
	`79`	`+ override predicate isAdditionalTaintStep(DataFlow::Node node1, DataFlow::Node node2) {`
	`80`	`+ sensitiveDataExtraStepForCalls(node1, node2)`
	`81`	`+ }`
`73`	`82`	`}`
`74`	`83`	`}`
Original file line number	Diff line number	Diff line change
`@@ -40,6 +40,10 @@ class SensitiveUseConfiguration extends TaintTracking::Configuration {`
`40`	`40`	`override predicate isSink(DataFlow::Node node) {`
`41`	`41`	`node = API::builtin("print").getACall().getArg(_)`
`42`	`42`	`}`
	`43`	`+`
	`44`	`+ override predicate isAdditionalTaintStep(DataFlow::Node node1, DataFlow::Node node2) {`
	`45`	`+ sensitiveDataExtraStepForCalls(node1, node2)`
	`46`	`+ }`
`43`	`47`	`}`
`44`	`48`	`// import DataFlow::PathGraph`
`45`	`49`	`// from SensitiveUseConfiguration cfg, DataFlow::PathNode source, DataFlow::PathNode sink`