Skip to content

Commit bebf4ca

Browse files
authored
Merge pull request #7357 from github/henrymercer/js-atm-only-featurize-with-flow
JS: Only featurize endpoints that are part of a flow path
2 parents (d058d36 + c186453), commit bebf4ca

File tree

2 files changed

+88
-56
lines changed

2 files changed

+88
-56
lines changed

javascript/ql/experimental/adaptivethreatmodeling/lib/experimental/adaptivethreatmodeling/EndpointFeatures.qll

Lines changed: 66 additions & 43 deletions
Original file line number | Diff line number | Diff line change
@@ -6,59 +6,76 @@
66

77
import javascript
88
import CodeToFeatures
9-
import EndpointScoring
9+
private import EndpointScoring
10+
11+
/**
12+
* A configuration that defines which endpoints should be featurized.
13+
*
14+
* This is used as a performance optimization to ensure that we only featurize the endpoints we need
15+
* to featurize.
16+
*/
17+
abstract class FeaturizationConfig extends string {
18+
bindingset[this]
19+
FeaturizationConfig() { any() }
20+
21+
abstract DataFlow::Node getAnEndpointToFeaturize();
22+
}
1023

1124
/**
1225
* Gets the value of the token-based feature named `featureName` for the endpoint `endpoint`.
1326
*
1427
* This is a single string containing a space-separated list of tokens.
1528
*/
1629
private string getTokenFeature(DataFlow::Node endpoint, string featureName) {
17-
// Features for endpoints that are contained within a function.
18-
exists(DatabaseFeatures::Entity entity | entity = getRepresentativeEntityForEndpoint(endpoint) |
19-
// The name of the function that encloses the endpoint.
20-
featureName = "enclosingFunctionName" and result = entity.getName()
21-
or
22-
// A feature containing natural language tokens from the function that encloses the endpoint in
23-
// the order that they appear in the source code.
24-
featureName = "enclosingFunctionBody" and
25-
result = unique(string x | x = FunctionBodies::getBodyTokenFeatureForEntity(entity))
26-
)
27-
or
28-
result =
29-
strictconcat(DataFlow::CallNode call, string component |
30-
component = getACallBasedTokenFeatureComponent(endpoint, call, featureName)
31-
|
32-
component, " "
30+
// Performance optimization: Restrict feature extraction to endpoints we've explicitly asked to featurize.
31+
endpoint = any(FeaturizationConfig cfg).getAnEndpointToFeaturize() and
32+
(
33+
// Features for endpoints that are contained within a function.
34+
exists(DatabaseFeatures::Entity entity | entity = getRepresentativeEntityForEndpoint(endpoint) |
35+
// The name of the function that encloses the endpoint.
36+
featureName = "enclosingFunctionName" and result = entity.getName()
37+
or
38+
// A feature containing natural language tokens from the function that encloses the endpoint in
39+
// the order that they appear in the source code.
40+
featureName = "enclosingFunctionBody" and
41+
result = unique(string x | x = FunctionBodies::getBodyTokenFeatureForEntity(entity))
3342
)
34-
or
35-
// The access path of the function being called, both with and without structural info, if the
36-
// function being called originates from an external API. For example, the endpoint here:
37-
//
38-
// ```js
39-
// const mongoose = require('mongoose'),
40-
// User = mongoose.model('User', null);
41-
// User.findOne(ENDPOINT);
42-
// ```
43-
//
44-
// would have a callee access path with structural info of
45-
// `mongoose member model instanceorreturn member findOne instanceorreturn`, and a callee access
46-
// path without structural info of `mongoose model findOne`.
47-
//
48-
// These features indicate that the callee comes from (reading the access path backwards) an
49-
// instance of the `findOne` member of an instance of the `model` member of the `mongoose`
50-
// external library.
51-
exists(AccessPaths::Boolean includeStructuralInfo |
52-
featureName =
53-
"calleeAccessPath" +
54-
any(string x | if includeStructuralInfo = true then x = "WithStructuralInfo" else x = "") and
43+
or
5544
result =
56-
concat(API::Node node, string accessPath |
57-
node.getInducingNode().(DataFlow::CallNode).getAnArgument() = endpoint and
58-
AccessPaths::accessPaths(node, includeStructuralInfo, accessPath, _)
45+
strictconcat(DataFlow::CallNode call, string component |
46+
component = getACallBasedTokenFeatureComponent(endpoint, call, featureName)
5947
|
60-
accessPath, " "
48+
component, " "
6149
)
50+
or
51+
// The access path of the function being called, both with and without structural info, if the
52+
// function being called originates from an external API. For example, the endpoint here:
53+
//
54+
// ```js
55+
// const mongoose = require('mongoose'),
56+
// User = mongoose.model('User', null);
57+
// User.findOne(ENDPOINT);
58+
// ```
59+
//
60+
// would have a callee access path with structural info of
61+
// `mongoose member model instanceorreturn member findOne instanceorreturn`, and a callee access
62+
// path without structural info of `mongoose model findOne`.
63+
//
64+
// These features indicate that the callee comes from (reading the access path backwards) an
65+
// instance of the `findOne` member of an instance of the `model` member of the `mongoose`
66+
// external library.
67+
exists(AccessPaths::Boolean includeStructuralInfo |
68+
featureName =
69+
"calleeAccessPath" +
70+
any(string x | if includeStructuralInfo = true then x = "WithStructuralInfo" else x = "") and
71+
result =
72+
concat(API::Node node, string accessPath |
73+
node.getInducingNode().(DataFlow::CallNode).getAnArgument() = endpoint and
74+
AccessPaths::accessPaths(node, includeStructuralInfo, accessPath, _)
75+
|
76+
accessPath, " "
77+
)
78+
)
6279
)
6380
}
6481

@@ -77,6 +94,8 @@ private string getTokenFeature(DataFlow::Node endpoint, string featureName) {
7794
private string getACallBasedTokenFeatureComponent(
7895
DataFlow::Node endpoint, DataFlow::CallNode call, string featureName
7996
) {
97+
// Performance optimization: Restrict feature extraction to endpoints we've explicitly asked to featurize.
98+
endpoint = any(FeaturizationConfig cfg).getAnEndpointToFeaturize() and
8099
// Features for endpoints that are an argument to a function call.
81100
endpoint = call.getAnArgument() and
82101
(
@@ -111,6 +130,9 @@ private string getACallBasedTokenFeatureComponent(
111130
module FunctionBodies {
112131
/** Holds if `location` is the location of an AST node within the entity `entity` and `token` is a node attribute associated with that AST node. */
113132
private predicate bodyTokens(DatabaseFeatures::Entity entity, Location location, string token) {
133+
// Performance optimization: Restrict the set of entities to those containing an endpoint to featurize.
134+
entity =
135+
getRepresentativeEntityForEndpoint(any(FeaturizationConfig cfg).getAnEndpointToFeaturize()) and
114136
exists(DatabaseFeatures::AstNode node |
115137
DatabaseFeatures::astNodes(entity, _, _, node, _) and
116138
token = unique(string t | DatabaseFeatures::nodeAttributes(node, t)) and
@@ -276,7 +298,8 @@ private string getASupportedFeatureName() {
276298
* `featureValue` for the endpoint `endpoint`.
277299
*/
278300
predicate tokenFeatures(DataFlow::Node endpoint, string featureName, string featureValue) {
279-
ModelScoring::endpoints(endpoint) and
301+
// Performance optimization: Restrict feature extraction to endpoints we've explicitly asked to featurize.
302+
endpoint = any(FeaturizationConfig cfg).getAnEndpointToFeaturize() and
280303
(
281304
if strictcount(getTokenFeature(endpoint, featureName)) = 1
282305
then featureValue = getTokenFeature(endpoint, featureName)

javascript/ql/experimental/adaptivethreatmodeling/lib/experimental/adaptivethreatmodeling/EndpointScoring.qll

Lines changed: 22 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -80,23 +80,30 @@ DatabaseFeatures::Entity getRepresentativeEntityForEndpoint(DataFlow::Node endpo
8080
}
8181

8282
module ModelScoring {
83-
predicate endpoints(DataFlow::Node endpoint) {
84-
getCfg().isEffectiveSource(endpoint) or
85-
getCfg().isEffectiveSink(endpoint)
83+
/**
84+
* A featurization config that only featurizes new candidate endpoints that are part of a flow
85+
* path.
86+
*/
87+
class RelevantFeaturizationConfig extends EndpointFeatures::FeaturizationConfig {
88+
RelevantFeaturizationConfig() { this = "RelevantFeaturization" }
89+
90+
override DataFlow::Node getAnEndpointToFeaturize() {
91+
getCfg().isEffectiveSource(result) and any(DataFlow::Configuration cfg).hasFlow(result, _)
92+
or
93+
getCfg().isEffectiveSink(result) and any(DataFlow::Configuration cfg).hasFlow(_, result)
94+
}
8695
}
8796

88-
private int requestedEndpointTypes() { result = any(EndpointType type).getEncoding() }
89-
90-
private predicate relevantTokenFeatures(
91-
DataFlow::Node endpoint, string featureName, string featureValue
92-
) {
93-
endpoints(endpoint) and
94-
EndpointFeatures::tokenFeatures(endpoint, featureName, featureValue)
97+
DataFlow::Node getARequestedEndpoint() {
98+
result = any(EndpointFeatures::FeaturizationConfig cfg).getAnEndpointToFeaturize()
9599
}
96100

101+
private int getARequestedEndpointType() { result = any(EndpointType type).getEncoding() }
102+
97103
predicate endpointScores(DataFlow::Node endpoint, int encodedEndpointType, float score) =
98-
scoreEndpoints(endpoints/1, requestedEndpointTypes/0, relevantTokenFeatures/3,
99-
getACompatibleModelChecksum/0)(endpoint, encodedEndpointType, score)
104+
scoreEndpoints(getARequestedEndpoint/0, getARequestedEndpointType/0,
105+
EndpointFeatures::tokenFeatures/3, getACompatibleModelChecksum/0)(endpoint,
106+
encodedEndpointType, score)
100107
}
101108

102109
/**
@@ -212,7 +219,9 @@ class EndpointScoringResults extends ScoringResults {
212219
}
213220

214221
module Debugging {
215-
query predicate hopInputEndpoints = ModelScoring::endpoints/1;
222+
query predicate hopInputEndpoints(DataFlow::Node endpoint) {
223+
endpoint = ModelScoring::getARequestedEndpoint()
224+
}
216225

217226
query predicate endpointScores = ModelScoring::endpointScores/3;
218227

0 commit comments

Comments
 (0)