Skip to content

Commit 9e50ce8

Browse files
committed
Move function body features into their own file
1 parent 865fb5d commit 9e50ce8

File tree

5 files changed

+161
-171
lines changed

5 files changed

+161
-171
lines changed

javascript/ql/experimental/adaptivethreatmodeling/lib/experimental/adaptivethreatmodeling/BaseScoring.qll

Lines changed: 0 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -14,22 +14,6 @@ external predicate availableMlModels(
1414
/** Get the ATM configuration. */
1515
ATMConfig getCfg() { any() }
1616

17-
/**
18-
* This module provides functionality that takes an endpoint and provides an function that encloses
19-
* that endpoint.
20-
*/
21-
module EndpointToFunction {
22-
private import CodeToFeatures
23-
24-
/**
25-
* Get a function containing the endpoint that is suitable for featurization. In general,
26-
* this associates an endpoint to multiple functions, since there may be more than one multiple entities to a single endpoint.
27-
*/
28-
Function getAFunctionForEndpoint(DataFlow::Node endpoint) {
29-
result = endpoint.getContainer().getEnclosingContainer*()
30-
}
31-
}
32-
3317
/**
3418
* Scoring information produced by a scoring model.
3519
*

javascript/ql/experimental/adaptivethreatmodeling/lib/experimental/adaptivethreatmodeling/EndpointFeatures.qll

Lines changed: 14 additions & 82 deletions
Original file line numberDiff line numberDiff line change
@@ -5,21 +5,9 @@
55
*/
66

77
import javascript
8-
import CodeToFeatures
9-
private import EndpointScoring
10-
11-
/**
12-
* A configuration that defines which endpoints should be featurized.
13-
*
14-
* This is used as a performance optimization to ensure that we only featurize the endpoints we need
15-
* to featurize.
16-
*/
17-
abstract class FeaturizationConfig extends string {
18-
bindingset[this]
19-
FeaturizationConfig() { any() }
20-
21-
abstract DataFlow::Node getAnEndpointToFeaturize();
22-
}
8+
private import CodeToFeatures
9+
private import FeaturizationConfig
10+
private import FunctionBodyFeatures as FunctionBodyFeatures
2311

2412
/**
2513
* Gets the value of the token-based feature named `featureName` for the endpoint `endpoint`.
@@ -32,15 +20,24 @@ private string getTokenFeature(DataFlow::Node endpoint, string featureName) {
3220
(
3321
// Features for endpoints that are contained within a function.
3422
exists(DatabaseFeatures::Entity entity |
35-
entity.getDefinedFunction() = getRepresentativeFunctionForEndpoint(endpoint)
23+
entity.getDefinedFunction() =
24+
FunctionBodyFeatures::getRepresentativeFunctionForEndpoint(endpoint)
3625
|
3726
// The name of the function that encloses the endpoint.
3827
featureName = "enclosingFunctionName" and result = entity.getName()
3928
or
4029
// A feature containing natural language tokens from the function that encloses the endpoint in
4130
// the order that they appear in the source code.
4231
featureName = "enclosingFunctionBody" and
43-
result = unique(string x | x = FunctionBodies::getBodyTokenFeatureForEntity(entity))
32+
result =
33+
strictconcat(string token, Location l |
34+
FunctionBodyFeatures::bodyTokens(entity, l, token)
35+
|
36+
token, " "
37+
order by
38+
l.getFile().getAbsolutePath(), l.getStartLine(), l.getStartColumn(), l.getEndLine(),
39+
l.getEndColumn(), token
40+
)
4441
)
4542
or
4643
result =
@@ -128,71 +125,6 @@ private string getACallBasedTokenFeatureComponent(
128125
)
129126
}
130127

131-
/** This module provides functionality for getting the function body feature associated with a particular entity. */
132-
module FunctionBodies {
133-
string getTokenizedAstNode(ASTNode node) {
134-
// NB: Unary and binary operator expressions e.g. -a, a + b and compound
135-
// assignments e.g. a += b can be identified by the expression type.
136-
result = node.(Identifier).getName()
137-
or
138-
// Computed property accesses for which we can predetermine the property being accessed.
139-
// NB: May alias with operators e.g. could have '+' as a property name.
140-
result = node.(IndexExpr).getPropertyName()
141-
or
142-
// We use `getRawValue` to give us distinct representations for `0xa`, `0xA`, and `10`.
143-
result = node.(NumberLiteral).getRawValue()
144-
or
145-
// We use `getValue` rather than `getRawValue` so we assign `"a"` and `'a'` the same representation.
146-
not node instanceof NumberLiteral and
147-
result = node.(Literal).getValue()
148-
or
149-
result = node.(TemplateElement).getRawValue()
150-
}
151-
152-
/** Returns an AST node within the function `f` that we should featurize. */
153-
pragma[inline]
154-
ASTNode getAnASTNodeToFeaturize(Function f) {
155-
result.getParent*() = f and
156-
not result = f.getIdentifier() and
157-
exists(getTokenizedAstNode(result))
158-
}
159-
160-
/** Holds if `location` is the location of an AST node within the entity `entity` and `token` is a node attribute associated with that AST node. */
161-
private predicate bodyTokens(DatabaseFeatures::Entity entity, Location location, string token) {
162-
// Performance optimization: Restrict the set of entities to those containing an endpoint to featurize.
163-
entity.getDefinedFunction() =
164-
getRepresentativeFunctionForEndpoint(any(FeaturizationConfig cfg).getAnEndpointToFeaturize()) and
165-
// Performance optimization: If a function has more than 256 body subtokens, then featurize it as absent. This
166-
// approximates the behavior of the classifer on non-generic body features where large body
167-
// features are replaced by the absent token.
168-
//
169-
// We count nodes instead of tokens because tokens are often not unique.
170-
strictcount(getAnASTNodeToFeaturize(entity.getDefinedFunction())) <= 256 and
171-
exists(ASTNode node |
172-
node = getAnASTNodeToFeaturize(entity.getDefinedFunction()) and
173-
token = getTokenizedAstNode(node) and
174-
location = node.getLocation()
175-
)
176-
}
177-
178-
/**
179-
* Gets the body token feature for the specified entity.
180-
*
181-
* This is a string containing natural language tokens in the order that they appear in the source code for the entity.
182-
*/
183-
string getBodyTokenFeatureForEntity(DatabaseFeatures::Entity entity) {
184-
result =
185-
strictconcat(string token, Location l |
186-
bodyTokens(entity, l, token)
187-
|
188-
token, " "
189-
order by
190-
l.getFile().getAbsolutePath(), l.getStartLine(), l.getStartColumn(), l.getEndLine(),
191-
l.getEndColumn(), token
192-
)
193-
}
194-
}
195-
196128
/**
197129
* This module provides functionality for getting a representation of the access path of nodes
198130
* within the program.

javascript/ql/experimental/adaptivethreatmodeling/lib/experimental/adaptivethreatmodeling/EndpointScoring.qll

Lines changed: 6 additions & 73 deletions
Original file line numberDiff line numberDiff line change
@@ -5,88 +5,21 @@
55
*/
66

77
private import javascript
8-
import BaseScoring
9-
import CodeToFeatures
10-
import EndpointFeatures as EndpointFeatures
11-
import EndpointTypes
8+
private import BaseScoring
9+
private import EndpointFeatures as EndpointFeatures
10+
private import FeaturizationConfig
11+
private import EndpointTypes
1212

1313
private string getACompatibleModelChecksum() {
1414
availableMlModels(result, "javascript", _, "atm-endpoint-scoring")
1515
}
1616

17-
/**
18-
* The maximum number of AST nodes an function containing an endpoint should have before we should
19-
* choose a smaller function to represent the endpoint.
20-
*
21-
* This is intended to represent a balance in terms of the amount of context we provide to the
22-
* model: we don't want the function to be too small, because then it doesn't contain very much
23-
* context and miss useful information, but also we don't want it to be too large, because then
24-
* there's likely to be a lot of irrelevant or very loosely related context.
25-
*/
26-
private int getMaxNumAstNodes() { result = 1024 }
27-
28-
/**
29-
* Returns the number of AST nodes contained within the specified function.
30-
*/
31-
private int getNumAstNodesInFunction(Function function) {
32-
// Restrict the values `function` can take on
33-
function = EndpointToFunction::getAFunctionForEndpoint(_) and
34-
result = count(EndpointFeatures::FunctionBodies::getAnASTNodeToFeaturize(function))
35-
}
36-
37-
/**
38-
* Get the enclosing function for an endpoint.
39-
*
40-
* This is used to compute the `enclosingFunctionBody` and `enclosingFunctionName` features.
41-
*
42-
* We try to use the largest function containing the endpoint that's below the AST node limit
43-
* defined in `getMaxNumAstNodes`. In the event of a tie, we use the function that appears first
44-
* within the source code.
45-
*
46-
* If no functions are smaller than the AST node limit, then we use the smallest function containing
47-
* the endpoint.
48-
*/
49-
Function getRepresentativeFunctionForEndpoint(DataFlow::Node endpoint) {
50-
// Check whether there's a function containing the endpoint that's smaller than the AST node
51-
// limit.
52-
if
53-
getNumAstNodesInFunction(EndpointToFunction::getAFunctionForEndpoint(endpoint)) <=
54-
getMaxNumAstNodes()
55-
then
56-
// Use the largest function smaller than the AST node limit, resolving ties using the function
57-
// that appears first in the source code.
58-
result =
59-
min(Function function, int numAstNodes, Location l |
60-
function = EndpointToFunction::getAFunctionForEndpoint(endpoint) and
61-
numAstNodes = getNumAstNodesInFunction(function) and
62-
numAstNodes <= getMaxNumAstNodes() and
63-
l = function.getLocation()
64-
|
65-
function
66-
order by
67-
numAstNodes desc, l.getStartLine(), l.getStartColumn(), l.getEndLine(), l.getEndColumn()
68-
)
69-
else
70-
// Use the smallest function, resolving ties using the function that appears first in the source
71-
// code.
72-
result =
73-
min(Function function, int numAstNodes, Location l |
74-
function = EndpointToFunction::getAFunctionForEndpoint(endpoint) and
75-
numAstNodes = getNumAstNodesInFunction(function) and
76-
l = function.getLocation()
77-
|
78-
function
79-
order by
80-
numAstNodes, l.getStartLine(), l.getStartColumn(), l.getEndLine(), l.getEndColumn()
81-
)
82-
}
83-
8417
module ModelScoring {
8518
/**
8619
* A featurization config that only featurizes new candidate endpoints that are part of a flow
8720
* path.
8821
*/
89-
class RelevantFeaturizationConfig extends EndpointFeatures::FeaturizationConfig {
22+
class RelevantFeaturizationConfig extends FeaturizationConfig {
9023
RelevantFeaturizationConfig() { this = "RelevantFeaturization" }
9124

9225
override DataFlow::Node getAnEndpointToFeaturize() {
@@ -97,7 +30,7 @@ module ModelScoring {
9730
}
9831

9932
DataFlow::Node getARequestedEndpoint() {
100-
result = any(EndpointFeatures::FeaturizationConfig cfg).getAnEndpointToFeaturize()
33+
result = any(FeaturizationConfig cfg).getAnEndpointToFeaturize()
10134
}
10235

10336
private int getARequestedEndpointType() { result = any(EndpointType type).getEncoding() }
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,14 @@
1+
import javascript
2+
3+
/**
4+
* A configuration that defines which endpoints should be featurized.
5+
*
6+
* This is used as a performance optimization to ensure that we only featurize the endpoints we need
7+
* to featurize.
8+
*/
9+
abstract class FeaturizationConfig extends string {
10+
bindingset[this]
11+
FeaturizationConfig() { any() }
12+
13+
abstract DataFlow::Node getAnEndpointToFeaturize();
14+
}
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,127 @@
1+
/*
2+
* FunctionBodyFeatures.qll
3+
*
4+
* Contains logic relating to the `enclosingFunctionBody` and `enclosingFunctionName` features.
5+
*/
6+
7+
import javascript
8+
private import CodeToFeatures
9+
private import FeaturizationConfig
10+
11+
string getTokenizedAstNode(ASTNode node) {
12+
// NB: Unary and binary operator expressions e.g. -a, a + b and compound
13+
// assignments e.g. a += b can be identified by the expression type.
14+
result = node.(Identifier).getName()
15+
or
16+
// Computed property accesses for which we can predetermine the property being accessed.
17+
// NB: May alias with operators e.g. could have '+' as a property name.
18+
result = node.(IndexExpr).getPropertyName()
19+
or
20+
// We use `getRawValue` to give us distinct representations for `0xa`, `0xA`, and `10`.
21+
result = node.(NumberLiteral).getRawValue()
22+
or
23+
// We use `getValue` rather than `getRawValue` so we assign `"a"` and `'a'` the same representation.
24+
not node instanceof NumberLiteral and
25+
result = node.(Literal).getValue()
26+
or
27+
result = node.(TemplateElement).getRawValue()
28+
}
29+
30+
/** Returns an AST node within the function `f` that we should featurize. */
31+
pragma[inline]
32+
ASTNode getAnASTNodeToFeaturize(Function f) {
33+
result.getParent*() = f and
34+
not result = f.getIdentifier() and
35+
exists(getTokenizedAstNode(result))
36+
}
37+
38+
/**
39+
* Get a function containing the endpoint that is suitable for featurization. In general,
40+
* this associates an endpoint with multiple functions, since a single endpoint may be enclosed by more than one (nested) function.
41+
*/
42+
Function getAFunctionForEndpoint(DataFlow::Node endpoint) {
43+
result = endpoint.getContainer().getEnclosingContainer*()
44+
}
45+
46+
/**
47+
* The maximum number of AST nodes a function containing an endpoint should have before we should
48+
* choose a smaller function to represent the endpoint.
49+
*
50+
* This is intended to represent a balance in terms of the amount of context we provide to the
51+
* model: we don't want the function to be too small, because then it doesn't contain very much
52+
* context and misses useful information, but also we don't want it to be too large, because then
53+
* there's likely to be a lot of irrelevant or very loosely related context.
54+
*/
55+
private int getMaxNumAstNodes() { result = 1024 }
56+
57+
/**
58+
* Returns the number of featurizable AST nodes (see `getAnASTNodeToFeaturize`) contained within the specified function.
59+
*/
60+
private int getNumAstNodesInFunction(Function function) {
61+
// Restrict the values `function` can take on
62+
function = getAFunctionForEndpoint(_) and
63+
result = count(getAnASTNodeToFeaturize(function))
64+
}
65+
66+
/**
67+
* Get the enclosing function for an endpoint.
68+
*
69+
* This is used to compute the `enclosingFunctionBody` and `enclosingFunctionName` features.
70+
*
71+
* We try to use the largest function containing the endpoint that's below the AST node limit
72+
* defined in `getMaxNumAstNodes`. In the event of a tie, we use the function that appears first
73+
* within the source code.
74+
*
75+
* If no functions are smaller than the AST node limit, then we use the smallest function containing
76+
* the endpoint.
77+
*/
78+
Function getRepresentativeFunctionForEndpoint(DataFlow::Node endpoint) {
79+
// Check whether there's a function containing the endpoint that's smaller than the AST node
80+
// limit.
81+
if getNumAstNodesInFunction(getAFunctionForEndpoint(endpoint)) <= getMaxNumAstNodes()
82+
then
83+
// Use the largest function smaller than the AST node limit, resolving ties using the function
84+
// that appears first in the source code.
85+
result =
86+
min(Function function, int numAstNodes, Location l |
87+
function = getAFunctionForEndpoint(endpoint) and
88+
numAstNodes = getNumAstNodesInFunction(function) and
89+
numAstNodes <= getMaxNumAstNodes() and
90+
l = function.getLocation()
91+
|
92+
function
93+
order by
94+
numAstNodes desc, l.getStartLine(), l.getStartColumn(), l.getEndLine(), l.getEndColumn()
95+
)
96+
else
97+
// Use the smallest function, resolving ties using the function that appears first in the source
98+
// code.
99+
result =
100+
min(Function function, int numAstNodes, Location l |
101+
function = getAFunctionForEndpoint(endpoint) and
102+
numAstNodes = getNumAstNodesInFunction(function) and
103+
l = function.getLocation()
104+
|
105+
function
106+
order by
107+
numAstNodes, l.getStartLine(), l.getStartColumn(), l.getEndLine(), l.getEndColumn()
108+
)
109+
}
110+
111+
/** Holds if `location` is the location of an AST node within the entity `entity` and `token` is a node attribute associated with that AST node. */
112+
predicate bodyTokens(DatabaseFeatures::Entity entity, Location location, string token) {
113+
// Performance optimization: Restrict the set of entities to those containing an endpoint to featurize.
114+
entity.getDefinedFunction() =
115+
getRepresentativeFunctionForEndpoint(any(FeaturizationConfig cfg).getAnEndpointToFeaturize()) and
116+
// Performance optimization: If a function has more than 256 body subtokens, then featurize it as absent. This
117+
// approximates the behavior of the classifier on non-generic body features where large body
118+
// features are replaced by the absent token.
119+
//
120+
// We count nodes instead of tokens because tokens are often not unique.
121+
strictcount(getAnASTNodeToFeaturize(entity.getDefinedFunction())) <= 256 and
122+
exists(ASTNode node |
123+
node = getAnASTNodeToFeaturize(entity.getDefinedFunction()) and
124+
token = getTokenizedAstNode(node) and
125+
location = node.getLocation()
126+
)
127+
}

0 commit comments

Comments
 (0)