Merge pull request github#6013 from RasmusWL/sensitive-improvements

yoff · web-flow · commit b19d64f17315 · 2021-06-15T14:45:40.000+02:00
Python: Improve sensitive data modeling
diff --git a/python/change-notes/2021-06-04-sensitive-data-modeling-expanded.md b/python/change-notes/2021-06-04-sensitive-data-modeling-expanded.md
@@ -0,0 +1,2 @@
+lgtm,codescanning
+* Expanded modeling of sensitive data sources to include: subscripting with a key that indicates sensitive data (`obj["password"]`), parameters whose names indicate sensitive data (`def func(password):`), and assignments to variables whose names indicate sensitive data (`password = ...`).
diff --git a/python/ql/src/semmle/python/dataflow/new/SensitiveDataSources.qll b/python/ql/src/semmle/python/dataflow/new/SensitiveDataSources.qll
@@ -5,10 +5,14 @@
 
 private import python
 private import semmle.python.dataflow.new.DataFlow
-// Need to import since frameworks can extend `RemoteFlowSource::Range`
+// Need to import `semmle.python.Frameworks` since frameworks can extend `SensitiveDataSource::Range`
 private import semmle.python.Frameworks
-private import semmle.python.Concepts
-private import semmle.python.security.SensitiveData as OldSensitiveData
+private import semmle.python.security.internal.SensitiveDataHeuristics as SensitiveDataHeuristics
+
+// We export these explicitly, so we don't also export the `HeuristicNames` module.
+class SensitiveDataClassification = SensitiveDataHeuristics::SensitiveDataClassification;
+
+module SensitiveDataClassification = SensitiveDataHeuristics::SensitiveDataClassification;
 
 /**
  * A data flow source of sensitive data, such as secrets, certificates, or passwords.
@@ -22,13 +26,9 @@ class SensitiveDataSource extends DataFlow::Node {
   SensitiveDataSource() { this = range }
 
   /**
-   * INTERNAL: Do not use.
-   *
-   * This will be rewritten to have better types soon, and therefore should only be used internally until then.
-   *
    * Gets the classification of the sensitive data.
    */
-  string getClassification() { result = range.getClassification() }
+  SensitiveDataClassification getClassification() { result = range.getClassification() }
 }
 
 /** Provides a class for modeling new sources of sensitive data, such as secrets, certificates, or passwords. */
@@ -41,26 +41,225 @@ module SensitiveDataSource {
    */
   abstract class Range extends DataFlow::Node {
     /**
-     * INTERNAL: Do not use.
-     *
-     * This will be rewritten to have better types soon, and therefore should only be used internally until then.
-     *
      * Gets the classification of the sensitive data.
      */
-    abstract string getClassification();
+    abstract SensitiveDataClassification getClassification();
   }
 }
 
-private class PortOfOldModeling extends SensitiveDataSource::Range {
-  OldSensitiveData::SensitiveData::Source oldSensitiveSource;
+/** Actual sensitive data modeling */
+private module SensitiveDataModeling {
+  private import SensitiveDataHeuristics::HeuristicNames
 
-  PortOfOldModeling() { this.asCfgNode() = oldSensitiveSource }
+  /**
+   * Gets a reference to a function that is considered to be a sensitive source of
+   * `classification`.
+   */
+  private DataFlow::LocalSourceNode sensitiveFunction(
+    DataFlow::TypeTracker t, SensitiveDataClassification classification
+  ) {
+    t.start() and
+    exists(Function f |
+      nameIndicatesSensitiveData(f.getName(), classification) and
+      result.asExpr() = f.getDefinition()
+    )
+    or
+    exists(DataFlow::TypeTracker t2 | result = sensitiveFunction(t2, classification).track(t2, t))
+  }
 
-  override string getClassification() {
-    exists(OldSensitiveData::SensitiveData classification |
-      oldSensitiveSource.isSourceOf(classification)
-    |
-      classification = "sensitive.data." + result
+  /**
+   * Gets a reference to a function that is considered to be a sensitive source of
+   * `classification`.
+   */
+  DataFlow::Node sensitiveFunction(SensitiveDataClassification classification) {
+    sensitiveFunction(DataFlow::TypeTracker::end(), classification).flowsTo(result)
+  }
+
+  /**
+   * Gets a reference to a string constant that, if used as the key in a lookup,
+   * indicates the presence of sensitive data with `classification`.
+   */
+  private DataFlow::LocalSourceNode sensitiveLookupStringConst(
+    DataFlow::TypeTracker t, SensitiveDataClassification classification
+  ) {
+    t.start() and
+    nameIndicatesSensitiveData(result.asExpr().(StrConst).getText(), classification)
+    or
+    exists(DataFlow::TypeTracker t2 |
+      result = sensitiveLookupStringConst(t2, classification).track(t2, t)
     )
   }
+
+  /**
+   * Gets a reference to a string constant that, if used as the key in a lookup,
+   * indicates the presence of sensitive data with `classification`.
+   *
+   * Also see `extraStepForCalls`.
+   */
+  DataFlow::Node sensitiveLookupStringConst(SensitiveDataClassification classification) {
+    sensitiveLookupStringConst(DataFlow::TypeTracker::end(), classification).flowsTo(result)
+  }
+
+  /** A function call that is considered a source of sensitive data. */
+  class SensitiveFunctionCall extends SensitiveDataSource::Range, DataFlow::CallCfgNode {
+    SensitiveDataClassification classification;
+
+    SensitiveFunctionCall() {
+      this.getFunction() = sensitiveFunction(classification)
+      or
+      // to cover functions that we don't have the definition for, and where the
+      // reference to the function has not already been marked as being sensitive
+      nameIndicatesSensitiveData(this.getFunction().asCfgNode().(NameNode).getId(), classification)
+    }
+
+    override SensitiveDataClassification getClassification() { result = classification }
+  }
+
+  /**
+   * Tracks any modeled source of sensitive data (with any classification),
+   * to limit the scope of `extraStepForCalls`. See its QLDoc for more context.
+   */
+  private DataFlow::LocalSourceNode possibleSensitiveCallable(DataFlow::TypeTracker t) {
+    t.start() and
+    result instanceof SensitiveDataSource
+    or
+    exists(DataFlow::TypeTracker t2 | result = possibleSensitiveCallable(t2).track(t2, t))
+  }
+
+  /**
+   * Tracks any modeled source of sensitive data (with any classification),
+   * to limit the scope of `extraStepForCalls`. See its QLDoc for more context.
+   */
+  private DataFlow::Node possibleSensitiveCallable() {
+    possibleSensitiveCallable(DataFlow::TypeTracker::end()).flowsTo(result)
+  }
+
+  /**
+   * Holds if the step from `nodeFrom` to `nodeTo` should be considered a
+   * taint-flow step for sensitive-data, to ensure calls are handled correctly.
+   *
+   * To handle calls properly, while preserving a good source for path explanations,
+   * you need to include this predicate as an additional taint step in your taint-tracking
+   * configurations.
+   *
+   * The core problem can be illustrated by the example below. If we consider the
+   * `print` a sink, what path and what source do we want to show? My initial approach
+   * would be to use type-tracking to propagate from the `not_found.get_passwd` attribute
+   * lookup, to the use of `non_sensitive_name`, and then create a new `SensitiveDataSource::Range`
+   * like `SensitiveFunctionCall`. Although that seems likely to work, it will also end up
+   * with a non-optimal path, which starts at _bad source_, and therefore doesn't show
+   * how we figured out that `non_sensitive_name`
+   * could be a function that returns a password (and in cases where there are many calls to
+   * `my_func` it will be annoying for someone to figure this out manually).
+   *
+   * By including this additional taint-step in the taint-tracking configuration, it's possible
+   * to get a path explanation going from _good source_ to the sink.
+   *
+   * ```python
+   * def my_func(non_sensitive_name):
+   *     x = non_sensitive_name() # <-- bad source
+   *     print(x) # <-- sink
+   *
+   * import not_found
+   * f = not_found.get_passwd # <-- good source
+   * my_func(f)
+   * ```
+   */
+  predicate extraStepForCalls(DataFlow::Node nodeFrom, DataFlow::CallCfgNode nodeTo) {
+    // However, we do still use the type-tracking approach to limit the size of this
+    // predicate.
+    nodeTo.getFunction() = nodeFrom and
+    nodeFrom = possibleSensitiveCallable()
+  }
+
+  /**
+   * Any kind of variable assignment (also including with/for) where the name indicates
+   * it contains sensitive data.
+   *
+   * Note: We _could_ make any access to a variable with a sensitive name a source of
+   * sensitive data, but to make path explanations in data-flow/taint-tracking good,
+   * we don't want that, since it works against allowing users to understand the flow
+   * in the program (which is the whole point).
+   *
+   * Note: To make data-flow/taint-tracking work, the expression that is _assigned_ to
+   * the variable is marked as the source (as compared to marking the variable as the
+   * source).
+   */
+  class SensitiveVariableAssignment extends SensitiveDataSource::Range {
+    SensitiveDataClassification classification;
+
+    SensitiveVariableAssignment() {
+      exists(DefinitionNode def |
+        nameIndicatesSensitiveData(def.(NameNode).getId(), classification) and
+        (
+          this.asCfgNode() = def.getValue()
+          or
+          this.asCfgNode() = def.getValue().(ForNode).getSequence()
+        ) and
+        not this.asExpr() instanceof FunctionExpr and
+        not this.asExpr() instanceof ClassExpr
+      )
+      or
+      exists(With with |
+        nameIndicatesSensitiveData(with.getOptionalVars().(Name).getId(), classification) and
+        this.asExpr() = with.getContextExpr()
+      )
+    }
+
+    override SensitiveDataClassification getClassification() { result = classification }
+  }
+
+  /** An attribute access that is considered a source of sensitive data. */
+  class SensitiveAttributeAccess extends SensitiveDataSource::Range {
+    SensitiveDataClassification classification;
+
+    SensitiveAttributeAccess() {
+      // Things like `foo.<sensitive-name>` or `from <module> import <sensitive-name>`
+      // I considered excluding any `from ... import something_sensitive`, but then realized that
+      // we should flag up `from ... import password as ...` as a password
+      nameIndicatesSensitiveData(this.(DataFlow::AttrRead).getAttributeName(), classification)
+      or
+      // Things like `getattr(foo, <reference-to-string>)`
+      this.(DataFlow::AttrRead).getAttributeNameExpr() = sensitiveLookupStringConst(classification)
+    }
+
+    override SensitiveDataClassification getClassification() { result = classification }
+  }
+
+  /** A subscript, where the key indicates the result will be sensitive data. */
+  class SensitiveSubscript extends SensitiveDataSource::Range {
+    SensitiveDataClassification classification;
+
+    SensitiveSubscript() {
+      this.asCfgNode().(SubscriptNode).getIndex() =
+        sensitiveLookupStringConst(classification).asCfgNode()
+    }
+
+    override SensitiveDataClassification getClassification() { result = classification }
+  }
+
+  /** A call to `get` on an object, where the key indicates the result will be sensitive data. */
+  class SensitiveGetCall extends SensitiveDataSource::Range, DataFlow::CallCfgNode {
+    SensitiveDataClassification classification;
+
+    SensitiveGetCall() {
+      this.getFunction().asCfgNode().(AttrNode).getName() = "get" and
+      this.getArg(0) = sensitiveLookupStringConst(classification)
+    }
+
+    override SensitiveDataClassification getClassification() { result = classification }
+  }
+
+  /** A parameter where the name indicates it will receive sensitive data. */
+  class SensitiveParameter extends SensitiveDataSource::Range, DataFlow::ParameterNode {
+    SensitiveDataClassification classification;
+
+    SensitiveParameter() {
+      nameIndicatesSensitiveData(this.getParameter().getName(), classification)
+    }
+
+    override SensitiveDataClassification getClassification() { result = classification }
+  }
 }
+
+predicate sensitiveDataExtraStepForCalls = SensitiveDataModeling::extraStepForCalls/2;
diff --git a/python/ql/src/semmle/python/security/dataflow/WeakSensitiveDataHashing.qll b/python/ql/src/semmle/python/security/dataflow/WeakSensitiveDataHashing.qll
@@ -13,6 +13,7 @@ private import semmle.python.dataflow.new.TaintTracking
 private import semmle.python.Concepts
 private import semmle.python.dataflow.new.RemoteFlowSources
 private import semmle.python.dataflow.new.BarrierGuards
+private import semmle.python.dataflow.new.SensitiveDataSources
 
 /**
  * Provides a taint-tracking configuration for detecting use of a broken or weak
@@ -38,6 +39,10 @@ module NormalHashFunction {
       or
       node instanceof Sanitizer
     }
+
+    override predicate isAdditionalTaintStep(DataFlow::Node node1, DataFlow::Node node2) {
+      sensitiveDataExtraStepForCalls(node1, node2)
+    }
   }
 }
 
@@ -70,5 +75,9 @@ module ComputationallyExpensiveHashFunction {
       or
       node instanceof Sanitizer
     }
+
+    override predicate isAdditionalTaintStep(DataFlow::Node node1, DataFlow::Node node2) {
+      sensitiveDataExtraStepForCalls(node1, node2)
+    }
   }
 }
diff --git a/python/ql/src/semmle/python/security/dataflow/WeakSensitiveDataHashingCustomizations.qll b/python/ql/src/semmle/python/security/dataflow/WeakSensitiveDataHashingCustomizations.qll
@@ -52,7 +52,9 @@ module NormalHashFunction {
    * A source of sensitive data, considered as a flow source.
    */
   class SensitiveDataSourceAsSource extends Source, SensitiveDataSource {
-    override string getClassification() { result = SensitiveDataSource.super.getClassification() }
+    override SensitiveDataClassification getClassification() {
+      result = SensitiveDataSource.super.getClassification()
+    }
   }
 
   /** The input to a hashing operation using a weak algorithm, considered as a flow sink. */
@@ -120,12 +122,12 @@ module ComputationallyExpensiveHashFunction {
    */
   class PasswordSourceAsSource extends Source, SensitiveDataSource {
     PasswordSourceAsSource() {
-      // TODO: once https://github.com/github/codeql/pull/5739 has been merged,
-      // don't use hardcoded value anymore
-      SensitiveDataSource.super.getClassification() = "password"
+      SensitiveDataSource.super.getClassification() = SensitiveDataClassification::password()
     }
 
-    override string getClassification() { result = SensitiveDataSource.super.getClassification() }
+    override SensitiveDataClassification getClassification() {
+      result = SensitiveDataSource.super.getClassification()
+    }
   }
 
   /**
diff --git a/python/ql/test/experimental/dataflow/sensitive-data/TestSensitiveDataSources.ql b/python/ql/test/experimental/dataflow/sensitive-data/TestSensitiveDataSources.ql
@@ -1,12 +1,17 @@
+// /**
+//  * @kind path-problem
+//  */
 import python
 import semmle.python.dataflow.new.DataFlow
+import semmle.python.dataflow.new.TaintTracking
 import TestUtilities.InlineExpectationsTest
 import semmle.python.dataflow.new.SensitiveDataSources
+private import semmle.python.ApiGraphs
 
 class SensitiveDataSourcesTest extends InlineExpectationsTest {
   SensitiveDataSourcesTest() { this = "SensitiveDataSourcesTest" }
 
-  override string getARelevantTag() { result = "SensitiveDataSource" }
+  override string getARelevantTag() { result in ["SensitiveDataSource", "SensitiveUse"] }
 
   override predicate hasActualResult(Location location, string element, string tag, string value) {
     exists(location.getFile().getRelativePath()) and
@@ -15,6 +20,32 @@ class SensitiveDataSourcesTest extends InlineExpectationsTest {
       element = source.toString() and
       value = source.getClassification() and
       tag = "SensitiveDataSource"
+      or
+      exists(DataFlow::Node use |
+        any(SensitiveUseConfiguration config).hasFlow(source, use) and
+        location = use.getLocation() and
+        element = use.toString() and
+        value = source.getClassification() and
+        tag = "SensitiveUse"
+      )
     )
   }
 }
+
+class SensitiveUseConfiguration extends TaintTracking::Configuration {
+  SensitiveUseConfiguration() { this = "SensitiveUseConfiguration" }
+
+  override predicate isSource(DataFlow::Node node) { node instanceof SensitiveDataSource }
+
+  override predicate isSink(DataFlow::Node node) {
+    node = API::builtin("print").getACall().getArg(_)
+  }
+
+  override predicate isAdditionalTaintStep(DataFlow::Node node1, DataFlow::Node node2) {
+    sensitiveDataExtraStepForCalls(node1, node2)
+  }
+}
+// import DataFlow::PathGraph
+// from SensitiveUseConfiguration cfg, DataFlow::PathNode source, DataFlow::PathNode sink
+// where cfg.hasFlowPath(source, sink)
+// select sink, source, sink, "taint from $@", source.getNode(), "here"
diff --git a/python/ql/test/experimental/dataflow/sensitive-data/test.py b/python/ql/test/experimental/dataflow/sensitive-data/test.py
diff --git a/python/ql/test/query-tests/Security/CWE-327-WeakSensitiveDataHashing/WeakSensitiveDataHashing.expected b/python/ql/test/query-tests/Security/CWE-327-WeakSensitiveDataHashing/WeakSensitiveDataHashing.expected

Original file line number	Diff line number	Diff line change
`@@ -0,0 +1,2 @@`
	`1`	`+lgtm,codescanning`
	`2`	+* Expanded modeling of sensitive data sources to include: subscripting with a key that indicates sensitive data (`obj["password"]`), parameters whose names indicate sensitive data (`def func(password):`), and assignments to variables whose names indicate sensitive data (`password = ...`).
Original file line number	Diff line number	Diff line change
`@@ -13,6 +13,7 @@ private import semmle.python.dataflow.new.TaintTracking`
`13`	`13`	`private import semmle.python.Concepts`
`14`	`14`	`private import semmle.python.dataflow.new.RemoteFlowSources`
`15`	`15`	`private import semmle.python.dataflow.new.BarrierGuards`
	`16`	`+private import semmle.python.dataflow.new.SensitiveDataSources`
`16`	`17`
`17`	`18`	`/**`
`18`	`19`	`* Provides a taint-tracking configuration for detecting use of a broken or weak`
`@@ -38,6 +39,10 @@ module NormalHashFunction {`
`38`	`39`	`or`
`39`	`40`	`node instanceof Sanitizer`
`40`	`41`	`}`
	`42`	`+`
	`43`	`+ override predicate isAdditionalTaintStep(DataFlow::Node node1, DataFlow::Node node2) {`
	`44`	`+ sensitiveDataExtraStepForCalls(node1, node2)`
	`45`	`+ }`
`41`	`46`	`}`
`42`	`47`	`}`
`43`	`48`
`@@ -70,5 +75,9 @@ module ComputationallyExpensiveHashFunction {`
`70`	`75`	`or`
`71`	`76`	`node instanceof Sanitizer`
`72`	`77`	`}`
	`78`	`+`
	`79`	`+ override predicate isAdditionalTaintStep(DataFlow::Node node1, DataFlow::Node node2) {`
	`80`	`+ sensitiveDataExtraStepForCalls(node1, node2)`
	`81`	`+ }`
`73`	`82`	`}`
`74`	`83`	`}`