Skip to content

Commit 9d7e773

Browse files
committed
Extract training data:
Implement the new query that selects data for training. For now we include clauses that implement logic that is identical to the old queries. Include a temporary wrapper query that converts the resulting data into the format expected by the endpoint pipeline. Move the small pieces of `ExtractEndpointData` that are still needed into `ExtractEndpointDataTraining.qll`.
1 parent 855edda commit 9d7e773

File tree

3 files changed

+245
-18
lines changed

3 files changed

+245
-18
lines changed

javascript/ql/experimental/adaptivethreatmodeling/lib/experimental/adaptivethreatmodeling/EndpointCharacteristics.qll

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -45,6 +45,9 @@ abstract class EndpointCharacteristic extends string {
4545
EndpointType endpointClass, boolean isPositiveIndicator, float confidence
4646
);
4747

48+
/** Indicators with confidence at or above this threshold are considered to be high-confidence indicators. */
49+
final float getHighConfidenceThreshold() { result = 0.8 }
50+
4851
// The following are some confidence values that are used in practice by the subclasses. They are defined as named
4952
// constants here to make it easier to change them in the future.
5053
final float maximalConfidence() { result = 1.0 }

javascript/ql/experimental/adaptivethreatmodeling/modelbuilding/extraction/ExtractEndpointDataTraining.ql

Lines changed: 3 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -4,23 +4,8 @@
44
* Extracts training data we can use to train ML models for ML-powered queries.
55
*/
66

7-
import javascript
8-
import ExtractEndpointData as ExtractEndpointData
7+
private import ExtractEndpointDataTraining as ExtractEndpointDataTraining
98

10-
query predicate endpoints(
11-
DataFlow::Node endpoint, string queryName, string key, string value, string valueType
12-
) {
13-
ExtractEndpointData::endpoints(endpoint, queryName, key, value, valueType) and
14-
// only select endpoints that are either Sink or NotASink
15-
ExtractEndpointData::endpoints(endpoint, queryName, "sinkLabel", ["Sink", "NotASink"], "string") and
16-
// do not select endpoints filtered out by end-to-end evaluation
17-
ExtractEndpointData::endpoints(endpoint, queryName, "isExcludedFromEndToEndEvaluation", "false",
18-
"boolean") and
19-
// only select endpoints that can be part of a tainted flow
20-
ExtractEndpointData::endpoints(endpoint, queryName, "isConstantExpression", "false", "boolean")
21-
}
9+
query predicate endpoints = ExtractEndpointDataTraining::reformattedTrainingEndpoints/5;
2210

23-
query predicate tokenFeatures(DataFlow::Node endpoint, string featureName, string featureValue) {
24-
endpoints(endpoint, _, _, _, _) and
25-
ExtractEndpointData::tokenFeatures(endpoint, featureName, featureValue)
26-
}
11+
query predicate tokenFeatures = ExtractEndpointDataTraining::tokenFeatures/3;
Lines changed: 239 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,239 @@
1+
/*
2+
* For internal use only.
3+
*
4+
* Extracts training data we can use to train ML models for ML-powered queries.
5+
*/
6+
7+
import javascript
8+
import experimental.adaptivethreatmodeling.EndpointCharacteristics
9+
import experimental.adaptivethreatmodeling.EndpointFeatures as EndpointFeatures
10+
import NoFeaturizationRestrictionsConfig
11+
private import Exclusions as Exclusions
12+
private import Queries
13+
private import experimental.adaptivethreatmodeling.NosqlInjectionATM as NosqlInjectionAtm
14+
private import experimental.adaptivethreatmodeling.SqlInjectionATM as SqlInjectionAtm
15+
private import experimental.adaptivethreatmodeling.TaintedPathATM as TaintedPathAtm
16+
private import experimental.adaptivethreatmodeling.XssATM as XssAtm
17+
18+
/**
19+
* Gets the set of featureName-featureValue pairs for each endpoint in the training set.
20+
*
21+
* `EndpointFeatures::tokenFeatures` has no results when `featureName` is absent for the endpoint
22+
* `endpoint`. To preserve compatibility with the data pipeline, this relation will instead set
23+
* `featureValue` to the empty string in this case.
24+
*/
25+
query predicate tokenFeatures(DataFlow::Node endpoint, string featureName, string featureValue) {
26+
trainingEndpoints(endpoint, _, _) and
27+
(
28+
EndpointFeatures::tokenFeatures(endpoint, featureName, featureValue)
29+
or
30+
// Performance note: this creates a Cartesian product between `endpoint` and `featureName`.
31+
featureName = EndpointFeatures::getASupportedFeatureName() and
32+
not exists(string value | EndpointFeatures::tokenFeatures(endpoint, featureName, value)) and
33+
featureValue = ""
34+
)
35+
}
36+
37+
/**
38+
* Holds if the given endpoint should be included in the training set as a sample belonging to endpointClass, and has
39+
* the given characteristic. This query uses the endpoint characteristics to select and label endpoints for the training
40+
* set, and provides a list of characteristics for each endpoint in the training set, which is used in the modeling
41+
* code.
42+
*
43+
* Params:
44+
* endpoint: The endpoint to include / exclude.
45+
* endpointClass: The sink type. Each EndpointType has a predicate getEncoding, which specifies the classifier class
46+
* for this sink type. Class 0 is the negative class (non-sink). Each positive int corresponds to a single sink type.
47+
* This gives us the label for the endpoint in the training data.
48+
* characteristic: Provides the list of characteristics that apply to the endpoint, which the modeling code currently
49+
* uses for type balancing.
50+
*
51+
* Note: This predicate will produce multiple tuples for endpoints that have multiple characteristics, which we must
52+
* then group together into a list of characteristics.
53+
*/
54+
query predicate trainingEndpoints(
55+
DataFlow::Node endpoint, EndpointType endpointClass, EndpointCharacteristic characteristic
56+
) {
57+
characteristic.getEndpoints(endpoint) and
58+
// Only consider the source code for the project being analyzed.
59+
exists(endpoint.getFile().getRelativePath()) and
60+
// Only select endpoints that can be part of a tainted flow: Constant expressions always evaluate to a constant
61+
// primitive value. Therefore they can't ever appear in an alert, making them less interesting training examples.
62+
// TODO: Experiment with removing this requirement.
63+
not endpoint.asExpr() instanceof ConstantExpr and
64+
// Do not select endpoints filtered out by end-to-end evaluation.
65+
// TODO: Experiment with removing this requirement.
66+
not Exclusions::isFileExcluded(endpoint.getFile()) and
67+
// Filter out negative examples that also have a `LikelyNotASinkCharacteristic`, to match what the old query does here:
68+
// https://github.com/github/codeql/blob/387e57546bf7352f7c1cfe781daa1a3799b7063e/javascript/ql/experimental/adaptivethreatmodeling/modelbuilding/extraction/ExtractEndpointData.qll#L77
69+
// TODO: Experiment with removing this requirement.
70+
not (
71+
endpointClass instanceof NegativeType and
72+
exists(EndpointCharacteristic c |
73+
c.getEndpoints(endpoint) and
74+
c instanceof LikelyNotASinkCharacteristic
75+
)
76+
) and
77+
(
78+
// If the list of characteristics includes positive indicators with high confidence for this class, select this as a
79+
// training sample belonging to the class.
80+
exists(EndpointCharacteristic characteristic2, float confidence |
81+
characteristic2.getEndpoints(endpoint) and
82+
characteristic2.getImplications(endpointClass, true, confidence) and
83+
confidence >= characteristic2.getHighConfidenceThreshold()
84+
) and
85+
(
86+
// Temporary: positive classes need no further check. For the negative class, additionally require that the
87+
// endpoint has no high-confidence indicator that it is a sink, because this is what was previously done.
88+
// TODO: Experiment with removing this requirement, and instead ensuring that an endpoint never has both a high
89+
// confidence indicator that it _is_ a sink and a high confidence indicator that it is _not_ a sink.
90+
not endpointClass instanceof NegativeType
91+
or
92+
not exists(EndpointCharacteristic characteristic3, float confidence3, EndpointType posClass |
93+
characteristic3.getEndpoints(endpoint) and
94+
characteristic3.getImplications(posClass, true, confidence3) and
95+
confidence3 >= characteristic3.getHighConfidenceThreshold() and
96+
not posClass instanceof NegativeType
97+
)
98+
)
99+
or
100+
// If the list of characteristics includes negative indicators with high confidence for all classes other than 0,
101+
// select this as a training sample of class 0 (this means we had query-specific characteristics to decide this
102+
// endpoint isn’t a sink for each of our sink types).
103+
endpointClass instanceof NegativeType and
104+
forall(EndpointType otherClass | not otherClass instanceof NegativeType |
105+
exists(EndpointCharacteristic characteristic2, float confidence |
106+
characteristic2.getEndpoints(endpoint) and
107+
characteristic2.getImplications(otherClass, false, confidence) and
108+
confidence >= characteristic2.getHighConfidenceThreshold()
109+
)
110+
)
111+
)
112+
}
113+
114+
/**
115+
* Temporary:
116+
* Reformat the training data that was extracted with the new logic to match the format produced by the old predicate.
117+
* This is the format expected by the endpoint pipeline.
118+
*/
119+
query predicate reformattedTrainingEndpoints(
120+
DataFlow::Node endpoint, string queryName, string key, string value, string valueType
121+
) {
122+
trainingEndpoints(endpoint, _, _) and
123+
exists(Query query |
124+
queryName = query.getName() and
125+
// For sinks, only emit rows for the query whose sink type matches; for non-sinks, emit rows for every query.
126+
(
127+
exists(EndpointType endpointClass |
128+
endpointClass.getDescription().matches(queryName + "%") and
129+
not endpointClass instanceof NegativeType and
130+
trainingEndpoints(endpoint, endpointClass, _)
131+
)
132+
or
133+
exists(EndpointType endpointClass |
134+
endpointClass instanceof NegativeType and
135+
trainingEndpoints(endpoint, endpointClass, _)
136+
)
137+
) and
138+
(
139+
// NOTE: We don't use hasFlowFromSource in training, so we could just hardcode it to be false.
140+
key = "hasFlowFromSource" and
141+
(
142+
if FlowFromSource::hasFlowFromSource(endpoint, query)
143+
then value = "true"
144+
else value = "false"
145+
) and
146+
valueType = "boolean"
147+
or
148+
// Constant expressions always evaluate to a constant primitive value. Therefore they can't ever
149+
// appear in an alert, making them less interesting training examples.
150+
key = "isConstantExpression" and
151+
(if endpoint.asExpr() instanceof ConstantExpr then value = "true" else value = "false") and
152+
valueType = "boolean"
153+
or
154+
// Holds if alerts involving the endpoint are excluded from the end-to-end evaluation.
155+
key = "isExcludedFromEndToEndEvaluation" and
156+
(if Exclusions::isFileExcluded(endpoint.getFile()) then value = "true" else value = "false") and
157+
valueType = "boolean"
158+
or
159+
// The label for this query, considering the endpoint as a sink.
160+
key = "sinkLabel" and
161+
valueType = "string" and
162+
value = "Sink" and
163+
exists(EndpointType endpointClass |
164+
endpointClass.getDescription().matches(queryName + "%") and
165+
not endpointClass instanceof NegativeType and
166+
trainingEndpoints(endpoint, endpointClass, _)
167+
)
168+
or
169+
key = "sinkLabel" and
170+
valueType = "string" and
171+
value = "NotASink" and
172+
exists(EndpointType endpointClass |
173+
endpointClass instanceof NegativeType and
174+
trainingEndpoints(endpoint, endpointClass, _)
175+
)
176+
or
177+
// The reason, or reasons, why the endpoint was labeled NotASink for this query, only for negative examples.
178+
key = "notASinkReason" and
179+
exists(EndpointCharacteristic characteristic, EndpointType endpointClass |
180+
characteristic.getEndpoints(endpoint) and
181+
characteristic.getImplications(endpointClass, true, _) and
182+
endpointClass instanceof NegativeType and
183+
value = characteristic
184+
) and
185+
// Don't include a notASinkReason for endpoints that are also known sinks.
186+
not exists(EndpointCharacteristic characteristic3, float confidence3, EndpointType posClass |
187+
characteristic3.getEndpoints(endpoint) and
188+
characteristic3.getImplications(posClass, true, confidence3) and
189+
confidence3 >= characteristic3.getHighConfidenceThreshold() and
190+
not posClass instanceof NegativeType
191+
) and
192+
valueType = "string"
193+
)
194+
)
195+
}
196+
197+
/**
198+
* Gets the ATM data flow configuration for the specified query.
199+
* TODO: Delete this once we are no longer surfacing `hasFlowFromSource`.
200+
*/
201+
DataFlow::Configuration getDataFlowCfg(Query query) {
202+
query instanceof NosqlInjectionQuery and result instanceof NosqlInjectionAtm::Configuration
203+
or
204+
query instanceof SqlInjectionQuery and result instanceof SqlInjectionAtm::Configuration
205+
or
206+
query instanceof TaintedPathQuery and result instanceof TaintedPathAtm::Configuration
207+
or
208+
query instanceof XssQuery and result instanceof XssAtm::Configuration
209+
}
210+
211+
// TODO: Delete this once we are no longer surfacing `hasFlowFromSource`.
212+
module FlowFromSource {
213+
predicate hasFlowFromSource(DataFlow::Node endpoint, Query q) {
214+
exists(Configuration cfg | cfg.getQuery() = q | cfg.hasFlow(_, endpoint))
215+
}
216+
217+
/**
218+
* A data flow configuration that replicates the data flow configuration for a specific query, but
219+
* replaces the set of sinks with the set of endpoints we're extracting.
220+
*
221+
* We use this to find out when there is flow to a particular endpoint from a known source.
222+
*
223+
* This configuration behaves in a very similar way to the `ForwardExploringConfiguration` class
224+
* from the CodeQL standard libraries for JavaScript.
225+
*/
226+
private class Configuration extends DataFlow::Configuration {
227+
Query q;
228+
229+
Configuration() { this = getDataFlowCfg(q) }
230+
231+
Query getQuery() { result = q }
232+
233+
/** Holds for every node: all nodes are treated as sinks, so flow to any extracted endpoint is found. */
234+
override predicate isSink(DataFlow::Node sink) { any() }
235+
236+
/** Holds for every node and every flow label `lbl`: all nodes are treated as sinks for all labels. */
237+
override predicate isSink(DataFlow::Node sink, DataFlow::FlowLabel lbl) { exists(lbl) }
238+
}
239+
}

0 commit comments

Comments
 (0)