Skip to content

Commit 4d7a828

Browse files
committed
JS: Initial commit of Adaptive Threat Modeling
1 parent 4b069d4 commit 4d7a828

26 files changed

+2569
-0
lines changed

.codeqlmanifest.json

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4,5 +4,7 @@
44
"cpp/ql/test/query-tests/Security/CWE/CWE-190/semmle/tainted/qlpack.yml",
55
"*/ql/examples/qlpack.yml",
66
"*/upgrades/qlpack.yml",
7+
"javascript/ql/experimental/adaptivethreatmodeling/lib/qlpack.yml",
8+
"javascript/ql/experimental/adaptivethreatmodeling/src/qlpack.yml",
79
"misc/legacy-support/*/qlpack.yml",
810
"misc/suite-helpers/qlpack.yml" ] }
Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,6 @@
1+
# [Internal only] Adaptive Threat Modeling for JavaScript
2+
3+
This directory contains CodeQL libraries and queries that power adaptive threat modeling for JavaScript.
4+
All APIs are experimental and may change in the future.
5+
6+
These queries can only be run by internal users; for external users they will return no results.
Lines changed: 112 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,112 @@
1+
/*
2+
* For internal use only.
3+
*
4+
* Configures boosting for adaptive threat modeling (ATM).
5+
*/
6+
7+
private import javascript as raw
8+
import EndpointTypes
9+
10+
/**
11+
* EXPERIMENTAL. This API may change in the future.
12+
*
13+
* A configuration class for defining known endpoints and endpoint filters for adaptive threat
14+
* modeling (ATM). Each boosted query must define its own extension of this abstract class.
15+
*
16+
* A configuration defines a set of known sources (`isKnownSource`) and sinks (`isKnownSink`).
17+
* It must also define a sink endpoint filter (`isEffectiveSink`) that filters candidate sinks
18+
* predicted by the machine learning model to a set of effective sinks.
19+
*
20+
* To get started with ATM, you can copy-paste an implementation of the relevant predicates from a
21+
* `DataFlow::Configuration` or `TaintTracking::Configuration` class for a standard security query.
22+
* For example, for SQL injection you can start by defining the `isKnownSource` and `isKnownSink`
23+
* predicates in the ATM configuration by copying and pasting the implementations of `isSource` and
24+
* `isSink` from `SqlInjection::Configuration`.
25+
*
26+
* Note that if the security query configuration defines additional edges beyond the standard data
27+
* flow edges, such as `NosqlInjection::Configuration`, you may need to replace the definition of
28+
* `isAdditionalFlowStep` with a more generalised definition of additional edges. See
29+
* `NosqlInjectionATM.qll` for an example of doing this.
30+
*/
31+
abstract class ATMConfig extends string {
  // The characteristic predicate places no constraint on `this`; `bindingset[this]` leaves it
  // to each subclass to bind the string that identifies it.
  bindingset[this]
  ATMConfig() { any() }

  /**
   * EXPERIMENTAL. This API may change in the future.
   *
   * Holds if `source` is already known to be a source of flow.
   *
   * The default implementation holds for no node.
   */
  predicate isKnownSource(raw::DataFlow::Node source) { none() }

  /**
   * EXPERIMENTAL. This API may change in the future.
   *
   * Holds if `sink` is already known to be a sink of flow.
   *
   * The default implementation holds for no node.
   */
  predicate isKnownSink(raw::DataFlow::Node sink) { none() }

  /**
   * EXPERIMENTAL. This API may change in the future.
   *
   * Holds if `candidateSource`, a candidate source predicted by the machine learning model,
   * should be treated as an effective source, that is, as a possible source of flow in the
   * boosted query.
   *
   * The default implementation holds for no node.
   */
  predicate isEffectiveSource(raw::DataFlow::Node candidateSource) { none() }

  /**
   * EXPERIMENTAL. This API may change in the future.
   *
   * Holds if `candidateSink`, a candidate sink predicted by the machine learning model, should
   * be treated as an effective sink, that is, as a possible sink of flow in the boosted query.
   *
   * The default implementation holds for no node.
   */
  predicate isEffectiveSink(raw::DataFlow::Node candidateSink) { none() }

  /**
   * EXPERIMENTAL. This API may change in the future.
   *
   * Holds if `candidateSink`, a candidate sink predicted by the machine learning model, should
   * be an effective sink whose model-provided score is replaced by `score`, with `why` giving
   * the reason for the override. Every sink for which this predicate holds MUST also satisfy
   * `isEffectiveSink`.
   *
   * For example, the ATM external API query uses this predicate so that it reports the same
   * results as the standard external API query, while giving a score of 0 to flows whose sinks
   * are filtered out by the endpoint filters.
   *
   * This predicate can be phased out once the ATM CodeQL libraries no longer need predicates
   * like `paddedScore` to add scores to alert messages in a way that works with lexical sort
   * orders.
   *
   * The default implementation holds for no node.
   */
  predicate isEffectiveSinkWithOverridingScore(
    raw::DataFlow::Node candidateSink, float score, string why
  ) {
    none()
  }

  /**
   * EXPERIMENTAL. This API may change in the future.
   *
   * Gets an endpoint type that applies to the sources of this query. Several endpoint types may
   * apply, in which case this predicate has several results.
   *
   * The default implementation has no results.
   */
  EndpointType getASourceEndpointType() { none() }

  /**
   * EXPERIMENTAL. This API may change in the future.
   *
   * Gets an endpoint type that applies to the sinks of this query. Several endpoint types may
   * apply, in which case this predicate has several results.
   *
   * The default implementation has no results.
   */
  EndpointType getASinkEndpointType() { none() }

  /**
   * EXPERIMENTAL. This API may change in the future.
   *
   * Gets the default cut-off value that controls how many alerts are produced.
   *
   * The cut-off value must lie in the range [0,1]:
   * - a value of 0 only produces alerts that are likely true-positives;
   * - a value of 1 produces all alerts, including those that are likely false-positives.
   *
   * The default implementation returns 0.0.
   */
  float getScoreCutoff() { result = 0.0 }
}
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,125 @@
1+
/*
2+
* For internal use only.
3+
*
4+
* Provides information about the results of boosted queries for use in adaptive threat modeling (ATM).
5+
*/
6+
7+
private import javascript as raw
8+
private import raw::DataFlow as DataFlow
9+
import ATMConfig
10+
private import BaseScoring
11+
private import EndpointScoring as EndpointScoring
12+
13+
module ATM {
  /**
   * EXPERIMENTAL. This API may change in the future.
   *
   * This module contains informational predicates about the results returned by adaptive threat
   * modeling (ATM).
   */
  module ResultsInfo {
    /**
     * Holds if some `ScoringResults` instance considers the flow from `source` to `sink` to be a
     * result with a sufficiently high likelihood of being a true-positive.
     */
    pragma[inline]
    private predicate shouldResultBeIncluded(DataFlow::Node source, DataFlow::Node sink) {
      any(ScoringResults results).shouldResultBeIncluded(source, sink)
    }

    /**
     * EXPERIMENTAL. This API may change in the future.
     *
     * Gets the score for the flow from `source` to `sink` in the boosted query.
     *
     * This has a result only if some data flow configuration has flow from `source` to `sink`,
     * the flow passes `shouldResultBeIncluded`, and the `ScoringResults` instances in scope
     * agree on exactly one score for the flow.
     */
    pragma[inline]
    float getScoreForFlow(DataFlow::Node source, DataFlow::Node sink) {
      any(DataFlow::Configuration cfg).hasFlow(source, sink) and
      shouldResultBeIncluded(source, sink) and
      // `unique` yields no result if the scoring results disagree on the score.
      result = unique(float s | s = any(ScoringResults results).getScoreForFlow(source, sink))
    }

    /**
     * Gets the string obtained by padding a score returned from `getScoreForFlow` to `length`
     * characters: a decimal point is appended if the score does not already contain one, and
     * "0"s are appended after that.
     *
     * Note that this predicate must itself define an upper bound on `length`, so that it has a
     * finite number of results. Currently this is defined as 12.
     */
    private string paddedScore(float score, int length) {
      // Base case: the unpadded score, with a "." appended if it has none.
      //
      // In this definition, we must restrict the values that `length` and `score` can take on so
      // that the predicate has a finite number of results.
      (score = getScoreForFlow(_, _) or score = 0) and
      length = result.length() and
      (
        // We need to make sure the padded score contains a "." so lexically sorting the padded
        // scores is equivalent to numerically sorting the scores.
        score.toString().charAt(_) = "." and
        result = score.toString()
        or
        not score.toString().charAt(_) = "." and
        result = score.toString() + "."
      )
      or
      // Recursive case: append one "0" to the string that is one character shorter.
      result = paddedScore(score, length - 1) + "0" and
      length <= 12
    }

    /**
     * EXPERIMENTAL. This API may change in the future.
     *
     * Gets a string representing the score of the flow from `source` to `sink` in the boosted
     * query.
     *
     * The returned string is a fixed length, such that lexically sorting the strings returned by
     * this predicate gives the same sort order as numerically sorting the scores of the flows.
     */
    pragma[inline]
    string getScoreStringForFlow(DataFlow::Node source, DataFlow::Node sink) {
      exists(float score |
        score = getScoreForFlow(source, sink) and
        (
          // A length of 12 is equivalent to 10 decimal places.
          // Longer representations are truncated to 12 characters ...
          score.toString().length() >= 12 and
          result = score.toString().substring(0, 12)
          or
          // ... and shorter ones are padded up to 12 characters.
          score.toString().length() < 12 and
          result = paddedScore(score, 12)
        )
      )
    }

    /**
     * EXPERIMENTAL. This API may change in the future.
     *
     * Holds if the flow from `source` to `sink` is likely to be reported by the base security
     * query, that is, if a single ATM configuration knows both `source` as a source and `sink`
     * as a sink.
     *
     * Currently this is a heuristic: it ignores potential differences in the definitions of
     * additional flow steps.
     */
    pragma[inline]
    predicate isFlowLikelyInBaseQuery(DataFlow::Node source, DataFlow::Node sink) {
      // Bind one configuration for both checks, so that a source known to one configuration is
      // never paired with a sink known only to a different configuration.
      exists(ATMConfig cfg | cfg = getCfg() |
        cfg.isKnownSource(source) and cfg.isKnownSink(sink)
      )
    }

    /**
     * EXPERIMENTAL. This API may change in the future.
     *
     * Gets additional information about why ATM included the flow from `source` to `sink` as an
     * alert.
     *
     * The result lists the origins reported by the `ScoringResults` instances for the source and
     * for the sink, each joined with ", ", or "unknown" where no origin is reported.
     */
    pragma[inline]
    string getAdditionalAlertInfo(DataFlow::Node source, DataFlow::Node sink) {
      exists(string sourceOrigins, string sinkOrigins |
        sourceOrigins = concat(any(ScoringResults results).getASourceOrigin(source), ", ") and
        sinkOrigins = concat(any(ScoringResults results).getASinkOrigin(sink), ", ") and
        result =
          "[Source origins: " +
            any(string s | if sourceOrigins != "" then s = sourceOrigins else s = "unknown") +
            "; sink origins: " +
            any(string s | if sinkOrigins != "" then s = sinkOrigins else s = "unknown") + "]"
      )
    }
  }
}
Lines changed: 121 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,121 @@
1+
/*
2+
* For internal use only.
3+
*
4+
* Provides shared scoring functionality for use in adaptive threat modeling (ATM).
5+
*/
6+
7+
private import javascript
8+
private import ATMConfig
9+
10+
/**
 * An external predicate with four string columns. As an `external` predicate it has no body
 * here; its tuples are supplied from outside this library.
 *
 * NOTE(review): judging by the parameter names, each tuple presumably describes one ATM model
 * by checksum, language, name and type. Confirm against whatever produces this data.
 */
external predicate adaptiveThreatModelingModels(
11+
string modelChecksum, string modelLanguage, string modelName, string modelType
12+
);
13+
14+
/**
 * Gets an ATM configuration.
 *
 * Every `ATMConfig` instance is a result of this predicate.
 */
ATMConfig getCfg() { result = any(ATMConfig cfg) }
16+
17+
/**
18+
* This module provides functionality that takes an endpoint and provides an entity that encloses that
19+
* endpoint and is suitable for similarity analysis.
20+
*/
21+
module EndpointToEntity {
  private import CodeToFeatures

  /**
   * Gets an entity that encloses `endpoint` and is suitable for similarity analysis.
   *
   * An entity qualifies if it appears in `DatabaseFeatures::entities` and the function it
   * defines is the container of `endpoint` or one of that container's transitive enclosing
   * containers. A single endpoint may therefore be associated with several entities.
   */
  DatabaseFeatures::Entity getAnEntityForEndpoint(DataFlow::Node endpoint) {
    endpoint.getContainer().getEnclosingContainer*() = result.getDefinedFunction() and
    DatabaseFeatures::entities(result, _, _, _, _, _, _, _, _)
  }
}
33+
34+
/**
35+
* This module provides functionality that takes an entity and provides effective endpoints within
36+
* that entity.
37+
*
38+
* We use the following terminology to describe endpoints:
39+
*
40+
* - The *candidate* endpoints are the set of data flow nodes that should be passed to the
41+
* appropriate endpoint filter to produce the set of effective endpoints.
42+
* When we have a model that beats the performance of the baseline, we will likely define the
43+
* candidate endpoints based on the most confident predictions of the model.
44+
* - An *effective* endpoint is a candidate endpoint which passes through the endpoint filter.
45+
* In other words, it is a candidate endpoint for which the `isEffectiveSink` (or
46+
* `isEffectiveSource`) predicate defined in the `ATMConfig` instance in scope holds.
47+
*/
48+
module EntityToEffectiveEndpoint {
  private import CodeToFeatures

  /**
   * Gets an endpoint candidate within `entity`.
   *
   * In this baseline implementation, every data flow node whose container is the function
   * defined by `entity`, or is transitively enclosed by that function, is a candidate.
   */
  private DataFlow::Node getABaselineEndpointCandidate(DatabaseFeatures::Entity entity) {
    entity.getDefinedFunction() = result.getContainer().getEnclosingContainer*()
  }

  /**
   * Gets an effective source enclosed by `entity`: a baseline endpoint candidate of `entity`
   * that the ATM configuration accepts via `isEffectiveSource`.
   *
   * N.B. This is _not_ an inverse of `EndpointToEntity::getAnEntityForEndpoint`: the effective
   * source may occur in a function defined within the specified entity.
   */
  DataFlow::Node getAnEffectiveSource(DatabaseFeatures::Entity entity) {
    getCfg().isEffectiveSource(result) and
    result = getABaselineEndpointCandidate(entity)
  }

  /**
   * Gets an effective sink enclosed by `entity`: a baseline endpoint candidate of `entity` that
   * the ATM configuration accepts via `isEffectiveSink`.
   *
   * N.B. This is _not_ an inverse of `EndpointToEntity::getAnEntityForEndpoint`: the effective
   * sink may occur in a function defined within the specified entity.
   */
  DataFlow::Node getAnEffectiveSink(DatabaseFeatures::Entity entity) {
    getCfg().isEffectiveSink(result) and
    result = getABaselineEndpointCandidate(entity)
  }
}
83+
84+
/**
85+
* Scoring information produced by a scoring model.
86+
*
87+
* Scoring models include embedding models and endpoint scoring models.
88+
*/
89+
abstract class ScoringResults extends string {
  // The characteristic predicate places no constraint on `this`; `bindingset[this]` leaves it
  // to each subclass to bind the string that identifies it.
  bindingset[this]
  ScoringResults() { any() }

  /**
   * Gets ATM's confidence that a path from `source` to `sink` represents a security
   * vulnerability. This will be a number between 0.0 and 1.0.
   */
  abstract float getScoreForFlow(DataFlow::Node source, DataFlow::Node sink);

  /**
   * Gets a string describing why ATM included `source` in the dataflow analysis.
   *
   * ATM may have several reasons for including a given source, in which case this predicate
   * should have one result per reason.
   */
  abstract string getASourceOrigin(DataFlow::Node source);

  /**
   * Gets a string describing why ATM included `sink` in the dataflow analysis.
   *
   * ATM may have several reasons for including a given sink, in which case this predicate
   * should have one result per reason.
   */
  abstract string getASinkOrigin(DataFlow::Node sink);

  /**
   * Holds if the flow from `source` to `sink` represents a result with a sufficiently high
   * likelihood of being a true-positive.
   */
  pragma[inline]
  abstract predicate shouldResultBeIncluded(DataFlow::Node source, DataFlow::Node sink);
}

0 commit comments

Comments
 (0)