Python: Extract SensitiveDataHeuristics to be shared with JS

RasmusWL · RasmusWL · commit 16b62486e970 · 2021-04-21T11:31:28.000+02:00
Initially I had named `nameIndicatesSensitiveData` `maybeSensitiveName` instead,
which made its relationship with `maybeSensitive` and `notSensitive` quite
strange -- and therefore I added the more informative `maybeSensitiveRegexp` and
`notSensitiveRegexp`.

Although I'm no longer using `maybeSensitiveName`, and I no longer have a strong
argument for making this name change, I still like it. If someone thinks this is
a terrible idea, I'm happy to change it though 👍
diff --git a/config/identical-files.json b/config/identical-files.json
@@ -439,5 +439,9 @@
   "CryptoAlgorithms Python/JS": [
     "javascript/ql/src/semmle/javascript/security/CryptoAlgorithms.qll",
     "python/ql/src/semmle/crypto/Crypto.qll"
+  ],
+  "SensitiveDataHeuristics Python/JS": [
+    "javascript/ql/src/semmle/javascript/security/internal/SensitiveDataHeuristics.qll",
+    "python/ql/src/semmle/python/security/internal/SensitiveDataHeuristics.qll"
   ]
-}
+}
diff --git a/javascript/ql/src/semmle/javascript/security/internal/SensitiveDataHeuristics.qll b/javascript/ql/src/semmle/javascript/security/internal/SensitiveDataHeuristics.qll
@@ -0,0 +1,128 @@
+/**
+ * INTERNAL: Do not use.
+ *
+ * Provides classes and predicates for identifying strings that may indicate the presence of
+ * sensitive data, so that this logic can be shared across CodeQL analyses of different languages.
+ *
+ * 'Sensitive' data in general is anything that should not be sent around in unencrypted form.
+ */
+
+/**
+ * A classification of different kinds of sensitive data:
+ *
+ *   - secret: generic secret or trusted data;
+ *   - id: a user name or other account information;
+ *   - password: a password or authorization key;
+ *   - certificate: a certificate.
+ *
+ * While classifications are represented as strings, this should not be relied upon.
+ * Instead, use the predicates in the `SensitiveDataClassification` module (`secret()`,
+ * `id()`, `password()`, `certificate()`) to work with classifications.
+ */
+class SensitiveDataClassification extends string {
+  SensitiveDataClassification() { this in ["secret", "id", "password", "certificate"] }
+}
+
+/**
+ * Provides a predicate for each supported kind of sensitive data, hiding the string encoding.
+ */
+module SensitiveDataClassification {
+  /** Gets the classification for secret or trusted data. */
+  SensitiveDataClassification secret() { result = "secret" }
+
+  /** Gets the classification for user names or other account information. */
+  SensitiveDataClassification id() { result = "id" }
+
+  /** Gets the classification for passwords or authorization keys. */
+  SensitiveDataClassification password() { result = "password" }
+
+  /** Gets the classification for certificates. */
+  SensitiveDataClassification certificate() { result = "certificate" }
+}
+
+/**
+ * INTERNAL: Do not use.
+ *
+ * Provides heuristics for identifying names related to sensitive information.
+ */
+module HeuristicNames {
+  /**
+   * Gets a regular expression that identifies strings that may indicate the presence of secret
+   * or trusted data.
+   */
+  string maybeSecret() { result = "(?is).*((?<!is)secret|(?<!un|is)trusted).*" }
+
+  /**
+   * Gets a regular expression that identifies strings that may indicate the presence of
+   * user names or other account information.
+   */
+  string maybeAccountInfo() {
+    result = "(?is).*acc(ou)?nt.*" or
+    result = "(?is).*(puid|username|userid).*"
+  }
+
+  /**
+   * Gets a regular expression that identifies strings that may indicate the presence of
+   * a password or an authorization key.
+   */
+  string maybePassword() {
+    result = "(?is).*pass(wd|word|code|phrase)(?!.*question).*" or
+    result = "(?is).*(auth(entication|ori[sz]ation)?)key.*"
+  }
+
+  /**
+   * Gets a regular expression that identifies strings that may indicate the presence of
+   * a certificate.
+   */
+  string maybeCertificate() { result = "(?is).*(cert)(?!.*(format|name)).*" }
+
+  /**
+   * Gets a regular expression that identifies strings that may indicate the presence
+   * of sensitive data, with `classification` describing the kind of sensitive data involved.
+   */
+  string maybeSensitiveRegexp(SensitiveDataClassification classification) {
+    result = maybeSecret() and classification = SensitiveDataClassification::secret()
+    or
+    result = maybeAccountInfo() and classification = SensitiveDataClassification::id()
+    or
+    result = maybePassword() and classification = SensitiveDataClassification::password()
+    or
+    result = maybeCertificate() and
+    classification = SensitiveDataClassification::certificate()
+  }
+
+  /**
+   * Gets a regular expression that identifies strings that may indicate the presence of data
+   * that is redacted, censored, obfuscated, hashed, encoded or encrypted, and hence non-sensitive.
+   */
+  string notSensitiveRegexp() {
+    result = "(?is).*(redact|censor|obfuscate|hash|md5|sha|((?<!un)(en))?(crypt|code)).*"
+  }
+
+  /**
+   * DEPRECATED: Use `maybeSensitiveRegexp` instead.
+   * Only kept as an alias of `maybeSensitiveRegexp` to aid with the internal rewrite.
+   */
+  deprecated predicate maybeSensitive = maybeSensitiveRegexp/1;
+
+  /**
+   * DEPRECATED: Use `notSensitiveRegexp` instead.
+   * Only kept as an alias of `notSensitiveRegexp` to aid with the internal rewrite.
+   */
+  deprecated predicate notSensitive = notSensitiveRegexp/0;
+
+  /**
+   * Holds if `name` may indicate the presence of sensitive data, and
+   * `name` does not indicate the presence of data that is hashed or encrypted, which would have
+   * rendered the data non-sensitive. `classification` describes the kind of sensitive data involved.
+   *
+   * That is, some regexp from `maybeSensitiveRegexp(classification)` matches `name` and none
+   * from `notSensitiveRegexp` does. NOTE(review): `notSensitiveRegexp` matches a bare `code`,
+   * so names such as `passcode` are rejected despite matching `maybePassword` -- intended?
+   */
+  bindingset[name]
+  predicate nameIndicatesSensitiveData(string name, SensitiveDataClassification classification) {
+    name.regexpMatch(maybeSensitiveRegexp(classification)) and
+    not name.regexpMatch(notSensitiveRegexp())
+  }
+}
diff --git a/python/ql/src/semmle/python/security/SensitiveData.qll b/python/ql/src/semmle/python/security/SensitiveData.qll
@@ -12,105 +12,60 @@
 import python
 import semmle.python.dataflow.TaintTracking
 import semmle.python.web.HttpRequest
-
-/**
- * Provides heuristics for identifying names related to sensitive information.
- *
- * INTERNAL: Do not use directly.
- * This is copied from the javascript library, but should be language independent.
- */
-private module HeuristicNames {
-  /**
-   * Gets a regular expression that identifies strings that may indicate the presence of secret
-   * or trusted data.
-   */
-  string maybeSecret() { result = "(?is).*((?<!is)secret|(?<!un|is)trusted).*" }
-
-  /**
-   * Gets a regular expression that identifies strings that may indicate the presence of
-   * user names or other account information.
-   */
-  string maybeAccountInfo() {
-    result = "(?is).*acc(ou)?nt.*" or
-    result = "(?is).*(puid|username|userid).*"
-  }
-
-  /**
-   * Gets a regular expression that identifies strings that may indicate the presence of
-   * a password or an authorization key.
-   */
-  string maybePassword() {
-    result = "(?is).*pass(wd|word|code|phrase)(?!.*question).*" or
-    result = "(?is).*(auth(entication|ori[sz]ation)?)key.*"
-  }
-
-  /**
-   * Gets a regular expression that identifies strings that may indicate the presence of
-   * a certificate.
-   */
-  string maybeCertificate() { result = "(?is).*(cert)(?!.*(format|name)).*" }
-
-  /**
-   * Gets a regular expression that identifies strings that may indicate the presence
-   * of sensitive data, with `classification` describing the kind of sensitive data involved.
-   */
-  string maybeSensitive(SensitiveData data) {
-    result = maybeSecret() and data instanceof SensitiveData::Secret
-    or
-    result = maybeAccountInfo() and data instanceof SensitiveData::Id
-    or
-    result = maybePassword() and data instanceof SensitiveData::Password
-    or
-    result = maybeCertificate() and data instanceof SensitiveData::Certificate
-  }
-
-  /**
-   * Gets a regular expression that identifies strings that may indicate the presence of data
-   * that is hashed or encrypted, and hence rendered non-sensitive.
-   */
-  string notSensitive() {
-    result = "(?is).*(redact|censor|obfuscate|hash|md5|sha|((?<!un)(en))?(crypt|code)).*"
-  }
-
-  bindingset[name]
-  SensitiveData getSensitiveDataForName(string name) {
-    name.regexpMatch(HeuristicNames::maybeSensitive(result)) and
-    not name.regexpMatch(HeuristicNames::notSensitive())
-  }
-}
+import semmle.python.security.internal.SensitiveDataHeuristics
+private import HeuristicNames
 
 abstract class SensitiveData extends TaintKind {
   bindingset[this]
   SensitiveData() { this = this }
+
+  /** Gets the classification of this sensitive data taint kind. */
+  abstract SensitiveDataClassification getClassification();
 }
 
 module SensitiveData {
   class Secret extends SensitiveData {
     Secret() { this = "sensitive.data.secret" }
 
     override string repr() { result = "a secret" }
+
+    override SensitiveDataClassification getClassification() {
+      result = SensitiveDataClassification::secret()
+    }
   }
 
   class Id extends SensitiveData {
     Id() { this = "sensitive.data.id" }
 
     override string repr() { result = "an ID" }
+
+    override SensitiveDataClassification getClassification() {
+      result = SensitiveDataClassification::id()
+    }
   }
 
   class Password extends SensitiveData {
     Password() { this = "sensitive.data.password" }
 
     override string repr() { result = "a password" }
+
+    override SensitiveDataClassification getClassification() {
+      result = SensitiveDataClassification::password()
+    }
   }
 
   class Certificate extends SensitiveData {
     Certificate() { this = "sensitive.data.certificate" }
 
     override string repr() { result = "a certificate or key" }
+
+    override SensitiveDataClassification getClassification() {
+      result = SensitiveDataClassification::certificate()
+    }
   }
 
   private SensitiveData fromFunction(Value func) {
-    result = HeuristicNames::getSensitiveDataForName(func.getName())
+    nameIndicatesSensitiveData(func.getName(), result.getClassification())
   }
 
   abstract class Source extends TaintSource {
@@ -134,7 +89,7 @@ module SensitiveData {
     SensitiveData data;
 
     SensitiveVariableAccess() {
-      data = HeuristicNames::getSensitiveDataForName(this.(AttrNode).getName())
+      nameIndicatesSensitiveData(this.(AttrNode).getName(), data.getClassification())
     }
 
     override predicate isSourceOf(TaintKind kind) { kind = data }
@@ -149,7 +104,7 @@ module SensitiveData {
       this.(CallNode).getFunction().(AttrNode).getName() = "get" and
       exists(StringValue sensitive |
         this.(CallNode).getAnArg().pointsTo(sensitive) and
-        data = HeuristicNames::getSensitiveDataForName(sensitive.getText())
+        nameIndicatesSensitiveData(sensitive.getText(), data.getClassification())
       )
     }
 
diff --git a/python/ql/src/semmle/python/security/internal/SensitiveDataHeuristics.qll b/python/ql/src/semmle/python/security/internal/SensitiveDataHeuristics.qll