cklin
diff --git a/‎.codeqlmanifest.json
Lines changed: 2 additions & 0 deletions b/‎.codeqlmanifest.json
Lines changed: 2 additions & 0 deletions
diff --git a/‎javascript/ql/experimental/adaptivethreatmodeling/README.md
Lines changed: 6 additions & 0 deletions b/‎javascript/ql/experimental/adaptivethreatmodeling/README.md
Lines changed: 6 additions & 0 deletions
diff --git a/‎javascript/ql/experimental/adaptivethreatmodeling/lib/experimental/adaptivethreatmodeling/ATMConfig.qll
Lines changed: 112 additions & 0 deletions b/‎javascript/ql/experimental/adaptivethreatmodeling/lib/experimental/adaptivethreatmodeling/ATMConfig.qll
Lines changed: 112 additions & 0 deletions
diff --git a/‎javascript/ql/experimental/adaptivethreatmodeling/lib/experimental/adaptivethreatmodeling/AdaptiveThreatModeling.qll
Lines changed: 125 additions & 0 deletions b/‎javascript/ql/experimental/adaptivethreatmodeling/lib/experimental/adaptivethreatmodeling/AdaptiveThreatModeling.qll
Lines changed: 125 additions & 0 deletions
diff --git a/‎javascript/ql/experimental/adaptivethreatmodeling/lib/experimental/adaptivethreatmodeling/BaseScoring.qll
Lines changed: 121 additions & 0 deletions b/‎javascript/ql/experimental/adaptivethreatmodeling/lib/experimental/adaptivethreatmodeling/BaseScoring.qll
Lines changed: 121 additions & 0 deletions
@@ -4,5 +4,7 @@
                "cpp/ql/test/query-tests/Security/CWE/CWE-190/semmle/tainted/qlpack.yml",
                "*/ql/examples/qlpack.yml",
                "*/upgrades/qlpack.yml",
+               "javascript/ql/experimental/adaptivethreatmodeling/lib/qlpack.yml",
+               "javascript/ql/experimental/adaptivethreatmodeling/src/qlpack.yml",
                "misc/legacy-support/*/qlpack.yml",
                "misc/suite-helpers/qlpack.yml" ] }
@@ -0,0 +1,6 @@
+# [Internal only] Adaptive Threat Modeling for JavaScript
+
+This directory contains CodeQL libraries and queries that power adaptive threat modeling for JavaScript.
+All APIs are experimental and may change in the future.
+
+These queries can only be run by internal users; for external users they will return no results.
@@ -0,0 +1,112 @@
+/*
+ * For internal use only.
+ *
+ * Configures boosting for adaptive threat modeling (ATM).
+ */
+
+private import javascript as raw
+import EndpointTypes
+
+/**
+ * EXPERIMENTAL. This API may change in the future.
+ *
+ * A configuration class for defining known endpoints and endpoint filters for adaptive threat
+ * modeling (ATM). Each boosted query must define its own extension of this abstract class.
+ *
+ * A configuration defines a set of known sources (`isKnownSource`) and sinks (`isKnownSink`).
+ * It must also define a sink endpoint filter (`isEffectiveSink`) that filters candidate sinks
+ * predicted by the machine learning model to a set of effective sinks. A source endpoint filter
+ * (`isEffectiveSource`) can be defined in the same way for candidate sources.
+ *
+ * Every member predicate below has a default body (`none()`, or `0.0` for `getScoreCutoff`), so
+ * an extension only needs to override the predicates that are relevant to its query.
+ *
+ * To get started with ATM, you can copy-paste an implementation of the relevant predicates from a
+ * `DataFlow::Configuration` or `TaintTracking::Configuration` class for a standard security query.
+ * For example, for SQL injection you can start by defining the `isKnownSource` and `isKnownSink`
+ * predicates in the ATM configuration by copying and pasting the implementations of `isSource` and
+ * `isSink` from `SqlInjection::Configuration`.
+ *
+ * Note that if the security query configuration defines additional edges beyond the standard data
+ * flow edges, such as `NosqlInjection::Configuration`, you may need to replace the definition of
+ * `isAdditionalFlowStep` with a more generalised definition of additional edges. See
+ * `NosqlInjectionATM.qll` for an example of doing this.
+ */
+abstract class ATMConfig extends string {
+  // The characteristic predicate accepts any string and does not bind `this` itself, hence the
+  // `bindingset[this]` annotation.
+  bindingset[this]
+  ATMConfig() { any() }
+
+  /**
+   * EXPERIMENTAL. This API may change in the future.
+   *
+   * Holds if `source` is a known source of flow.
+   *
+   * Holds for no node by default.
+   */
+  predicate isKnownSource(raw::DataFlow::Node source) { none() }
+
+  /**
+   * EXPERIMENTAL. This API may change in the future.
+   *
+   * Holds if `sink` is a known sink of flow.
+   *
+   * Holds for no node by default.
+   */
+  predicate isKnownSink(raw::DataFlow::Node sink) { none() }
+
+  /**
+   * EXPERIMENTAL. This API may change in the future.
+   *
+   * Holds if the candidate source `candidateSource` predicted by the machine learning model should be
+   * an effective source, i.e. one considered as a possible source of flow in the boosted query.
+   *
+   * Holds for no node by default.
+   */
+  predicate isEffectiveSource(raw::DataFlow::Node candidateSource) { none() }
+
+  /**
+   * EXPERIMENTAL. This API may change in the future.
+   *
+   * Holds if the candidate sink `candidateSink` predicted by the machine learning model should be
+   * an effective sink, i.e. one considered as a possible sink of flow in the boosted query.
+   *
+   * Holds for no node by default.
+   */
+  predicate isEffectiveSink(raw::DataFlow::Node candidateSink) { none() }
+
+  /**
+   * EXPERIMENTAL. This API may change in the future.
+   *
+   * Holds if the candidate sink `candidateSink` predicted by the machine learning model should be
+   * an effective sink that overrides the score provided by the machine learning model with the
+   * score `score` for reason `why`. The effective sinks identified by this predicate MUST be a
+   * subset of those identified by the `isEffectiveSink` predicate.
+   *
+   * For example, in the ATM external API query, we use this predicate to ensure the ATM external
+   * API query produces the same results as the standard external API query, but assigns flows
+   * involving sinks that are filtered out by the endpoint filters a score of 0.
+   *
+   * This predicate can be phased out once we no longer need to rely on predicates like
+   * `paddedScore` in the ATM CodeQL libraries to add scores to alert messages in a way that works
+   * with lexical sort orders.
+   *
+   * Holds for no tuple by default.
+   */
+  predicate isEffectiveSinkWithOverridingScore(
+    raw::DataFlow::Node candidateSink, float score, string why
+  ) {
+    none()
+  }
+
+  /**
+   * EXPERIMENTAL. This API may change in the future.
+   *
+   * Gets an endpoint type for the sources of this query. A query may have multiple applicable
+   * endpoint types for its sources.
+   *
+   * Has no result by default.
+   */
+  EndpointType getASourceEndpointType() { none() }
+
+  /**
+   * EXPERIMENTAL. This API may change in the future.
+   *
+   * Gets an endpoint type for the sinks of this query. A query may have multiple applicable
+   * endpoint types for its sinks.
+   *
+   * Has no result by default.
+   */
+  EndpointType getASinkEndpointType() { none() }
+
+  /**
+   * EXPERIMENTAL. This API may change in the future.
+   *
+   * Specifies the default cut-off value that controls how many alerts are produced.
+   * The cut-off value must be in the range [0,1].
+   * A cut-off value of 0 only produces alerts that are likely true-positives.
+   * A cut-off value of 1 produces all alerts including those that are likely false-positives.
+   *
+   * The default implementation returns 0.0.
+   */
+  float getScoreCutoff() { result = 0.0 }
+}
@@ -0,0 +1,125 @@
+/*
+ * For internal use only.
+ *
+ * Provides information about the results of boosted queries for use in adaptive threat modeling (ATM).
+ */
+
+private import javascript as raw
+private import raw::DataFlow as DataFlow
+import ATMConfig
+private import BaseScoring
+private import EndpointScoring as EndpointScoring
+
+module ATM {
+  /**
+   * EXPERIMENTAL. This API may change in the future.
+   *
+   * This module contains informational predicates about the results returned by adaptive threat
+   * modeling (ATM).
+   */
+  module ResultsInfo {
+    /**
+     * Holds if at least one `ScoringResults` instance reports that the flow from `source` to
+     * `sink` is sufficiently likely to be a true-positive to be included in the results.
+     */
+    pragma[inline]
+    private predicate shouldResultBeIncluded(DataFlow::Node source, DataFlow::Node sink) {
+      exists(ScoringResults scoring | scoring.shouldResultBeIncluded(source, sink))
+    }
+
+    /**
+     * EXPERIMENTAL. This API may change in the future.
+     *
+     * Gets the score for the flow from `source` to `sink` in the boosted query.
+     *
+     * There is a result only if some data flow configuration has flow from `source` to `sink`,
+     * the flow passes `shouldResultBeIncluded`, and the available scoring results yield exactly
+     * one score for the pair.
+     */
+    pragma[inline]
+    float getScoreForFlow(DataFlow::Node source, DataFlow::Node sink) {
+      shouldResultBeIncluded(source, sink) and
+      exists(DataFlow::Configuration flowCfg | flowCfg.hasFlow(source, sink)) and
+      result =
+        unique(float flowScore |
+          exists(ScoringResults scoring | flowScore = scoring.getScoreForFlow(source, sink))
+        )
+    }
+
+    /**
+     * Pad a score returned from `getScoreForFlow` to a particular length by adding a decimal
+     * point if one does not already exist, and "0"s after that decimal point.
+     *
+     * The result is the padded string, and `length` is the number of characters in that string.
+     *
+     * Note that this predicate must itself define an upper bound on `length`, so that it has a
+     * finite number of results. Currently this is defined as 12.
+     */
+    private string paddedScore(float score, int length) {
+      // Base case: the unpadded score string, with a "." appended if it does not contain one.
+      // In this definition, we must restrict the values that `length` and `score` can take on so
+      // that the predicate has a finite number of results.
+      (score = getScoreForFlow(_, _) or score = 0) and
+      length = result.length() and
+      (
+        // We need to make sure the padded score contains a "." so lexically sorting the padded
+        // scores is equivalent to numerically sorting the scores.
+        score.toString().charAt(_) = "." and
+        result = score.toString()
+        or
+        not score.toString().charAt(_) = "." and
+        result = score.toString() + "."
+      )
+      or
+      // Recursive case: append one "0" to the padded string that is one character shorter.
+      // The bound on `length` keeps the recursion finite.
+      result = paddedScore(score, length - 1) + "0" and
+      length <= 12
+    }
+
+    /**
+     * EXPERIMENTAL. This API may change in the future.
+     *
+     * Gets a string rendering of the score of the flow from `source` to `sink` in the boosted
+     * query.
+     *
+     * Every result is exactly 12 characters long: longer renderings are truncated and shorter
+     * ones are padded by `paddedScore`. The fixed length is what allows a lexical sort of these
+     * strings to stand in for a numerical sort of the underlying scores.
+     */
+    pragma[inline]
+    string getScoreStringForFlow(DataFlow::Node source, DataFlow::Node sink) {
+      exists(float flowScore, string rendered |
+        flowScore = getScoreForFlow(source, sink) and
+        rendered = flowScore.toString()
+      |
+        // Twelve characters correspond to ten decimal places.
+        if rendered.length() >= 12
+        then result = rendered.substring(0, 12)
+        else result = paddedScore(flowScore, 12)
+      )
+    }
+
+    /**
+     * EXPERIMENTAL. This API may change in the future.
+     *
+     * Holds if the flow from `source` to `sink` is likely to be reported by the base security
+     * query, that is, if `source` is a known source and `sink` is a known sink of an ATM
+     * configuration. The two endpoints are checked independently of each other.
+     *
+     * This is only a heuristic: potential differences in the definitions of additional flow
+     * steps are not taken into account.
+     */
+    pragma[inline]
+    predicate isFlowLikelyInBaseQuery(DataFlow::Node source, DataFlow::Node sink) {
+      exists(ATMConfig sourceCfg | sourceCfg = getCfg() | sourceCfg.isKnownSource(source)) and
+      exists(ATMConfig sinkCfg | sinkCfg = getCfg() | sinkCfg.isKnownSink(sink))
+    }
+
+    /**
+     * EXPERIMENTAL. This API may change in the future.
+     *
+     * Gets a bracketed message explaining why ATM included the flow from `source` to `sink` as
+     * an alert.
+     *
+     * The message lists the comma-separated source origins followed by the comma-separated sink
+     * origins; a side for which no origin is reported is shown as "unknown".
+     */
+    pragma[inline]
+    string getAdditionalAlertInfo(DataFlow::Node source, DataFlow::Node sink) {
+      exists(string sourceOrigins, string sinkOrigins, string sourceText, string sinkText |
+        sourceOrigins = concat(any(ScoringResults results).getASourceOrigin(source), ", ") and
+        sinkOrigins = concat(any(ScoringResults results).getASinkOrigin(sink), ", ") and
+        // Substitute a placeholder when no origin was reported for an endpoint.
+        (if sourceOrigins = "" then sourceText = "unknown" else sourceText = sourceOrigins) and
+        (if sinkOrigins = "" then sinkText = "unknown" else sinkText = sinkOrigins) and
+        result = "[Source origins: " + sourceText + "; sink origins: " + sinkText + "]"
+      )
+    }
+  }
+}
@@ -0,0 +1,121 @@
+/*
+ * For internal use only.
+ *
+ * Provides shared scoring functionality for use in adaptive threat modeling (ATM).
+ */
+
+private import javascript
+private import ATMConfig
+
+/**
+ * An externally supplied relation with four string columns: a model checksum, a model language,
+ * a model name and a model type.
+ *
+ * NOTE(review): presumably each tuple describes one ATM model that is available to the query;
+ * nothing in this part of the file reads the relation, so confirm against its users.
+ */
+external predicate adaptiveThreatModelingModels(
+  string modelChecksum, string modelLanguage, string modelName, string modelType
+);
+
+/**
+ * Gets an ATM configuration.
+ *
+ * The body is `any()`, so the result is not restricted to one particular `ATMConfig`.
+ */
+ATMConfig getCfg() { any() }
+
+/**
+ * Maps an endpoint to the entities that enclose it and that are suitable for similarity
+ * analysis.
+ */
+module EndpointToEntity {
+  private import CodeToFeatures
+
+  /**
+   * Gets an entity enclosing `endpoint` that is suitable for similarity analysis.
+   *
+   * The entity's defined function is either the container of `endpoint` or one of the
+   * containers transitively enclosing it, so a single endpoint may be associated with several
+   * entities.
+   */
+  DatabaseFeatures::Entity getAnEntityForEndpoint(DataFlow::Node endpoint) {
+    endpoint.getContainer().getEnclosingContainer*() = result.getDefinedFunction() and
+    DatabaseFeatures::entities(result, _, _, _, _, _, _, _, _)
+  }
+}
+
+/**
+ * Maps an entity to the effective endpoints found within it.
+ *
+ * Terminology used for endpoints:
+ *
+ * - The *candidate* endpoints are the data flow nodes that are handed to the appropriate
+ *   endpoint filter in order to obtain the effective endpoints.
+ *   When we have a model that beats the performance of the baseline, we will likely define the
+ *   candidate endpoints based on the most confident predictions of the model.
+ * - An *effective* endpoint is a candidate endpoint that passes the endpoint filter, i.e. a
+ *   candidate endpoint for which the `isEffectiveSink` (or `isEffectiveSource`) predicate of
+ *   the `ATMConfig` instance in scope holds.
+ */
+module EntityToEffectiveEndpoint {
+  private import CodeToFeatures
+
+  /**
+   * Gets a candidate endpoint within `entity`.
+   *
+   * In this baseline implementation, every data flow node whose container is the function
+   * defined by `entity`, or is transitively enclosed by that function, is a candidate.
+   */
+  private DataFlow::Node getABaselineEndpointCandidate(DatabaseFeatures::Entity entity) {
+    entity.getDefinedFunction() = result.getContainer().getEnclosingContainer*()
+  }
+
+  /**
+   * Gets an effective source enclosed by `entity`.
+   *
+   * N.B. Do _not_ treat this as an inverse of `EndpointToEntity::getAnEntityForEndpoint`: the
+   * effective source may occur in a function defined within the specified entity.
+   */
+  DataFlow::Node getAnEffectiveSource(DatabaseFeatures::Entity entity) {
+    getCfg().isEffectiveSource(result) and
+    result = getABaselineEndpointCandidate(entity)
+  }
+
+  /**
+   * Gets an effective sink enclosed by `entity`.
+   *
+   * N.B. Do _not_ treat this as an inverse of `EndpointToEntity::getAnEntityForEndpoint`: the
+   * effective sink may occur in a function defined within the specified entity.
+   */
+  DataFlow::Node getAnEffectiveSink(DatabaseFeatures::Entity entity) {
+    getCfg().isEffectiveSink(result) and
+    result = getABaselineEndpointCandidate(entity)
+  }
+}
+
+/**
+ * Scoring information produced by a scoring model.
+ *
+ * Scoring models include embedding models and endpoint scoring models.
+ *
+ * All member predicates are abstract, so each concrete scoring model must implement every one
+ * of them.
+ */
+abstract class ScoringResults extends string {
+  // The characteristic predicate accepts any string and does not bind `this` itself, hence the
+  // `bindingset[this]` annotation.
+  bindingset[this]
+  ScoringResults() { any() }
+
+  /**
+   * Gets ATM's confidence that a path between `source` and `sink` represents a security
+   * vulnerability. Implementations are expected to return a number between 0.0 and 1.0.
+   */
+  abstract float getScoreForFlow(DataFlow::Node source, DataFlow::Node sink);
+
+  /**
+   * Gets a string representing why ATM included the given source in the dataflow analysis.
+   *
+   * In general, there may be multiple reasons why ATM included the given source, in which case
+   * this predicate should have multiple results.
+   */
+  abstract string getASourceOrigin(DataFlow::Node source);
+
+  /**
+   * Gets a string representing why ATM included the given sink in the dataflow analysis.
+   *
+   * In general, there may be multiple reasons why ATM included the given sink, in which case this
+   * predicate should have multiple results.
+   */
+  abstract string getASinkOrigin(DataFlow::Node sink);
+
+  /**
+   * Holds if the flow from `source` to `sink` represents a result with sufficiently high
+   * likelihood of being a true-positive.
+   */
+  pragma[inline]
+  abstract predicate shouldResultBeIncluded(DataFlow::Node source, DataFlow::Node sink);
+}