Skip to content

Commit 92d6fec

Browse files
committed
Optimize performance of body tokens
The refactoring to remove the `CodeToFeatures` AST reintroduced a performance problem. This commit resolves it by pushing size restrictions into intermediate predicates.
1 parent 7f61738 commit 92d6fec

File tree

2 files changed

+32
-20
lines changed

2 files changed

+32
-20
lines changed

javascript/ql/experimental/adaptivethreatmodeling/lib/experimental/adaptivethreatmodeling/EndpointFeatures.qll

Lines changed: 1 addition & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -27,15 +27,7 @@ private string getTokenFeature(DataFlow::Node endpoint, string featureName) {
2727
// A feature containing natural language tokens from the function that encloses the endpoint in
2828
// the order that they appear in the source code.
2929
featureName = "enclosingFunctionBody" and
30-
result =
31-
strictconcat(string token, Location l |
32-
FunctionBodyFeatures::bodyTokens(function, l, token)
33-
|
34-
token, " "
35-
order by
36-
l.getFile().getAbsolutePath(), l.getStartLine(), l.getStartColumn(), l.getEndLine(),
37-
l.getEndColumn(), token
38-
)
30+
result = FunctionBodyFeatures::getBodyTokensFeature(function)
3931
)
4032
or
4133
result =

javascript/ql/experimental/adaptivethreatmodeling/lib/experimental/adaptivethreatmodeling/FunctionBodyFeatures.qll

Lines changed: 31 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -30,15 +30,17 @@ string getTokenizedAstNode(ASTNode node) {
3030
pragma[inline]
3131
ASTNode getAnASTNodeToFeaturize(Function f) {
3232
result.getParent*() = f and
33-
not result = f.getIdentifier() and
34-
exists(getTokenizedAstNode(result))
33+
// Don't featurize the function name as part of the function body tokens
34+
not result = f.getIdentifier()
3535
}
3636

3737
/**
3838
* Get a function containing the endpoint that is suitable for featurization. In general, this
3939
* can associate an endpoint to multiple functions, since functions can be nested in JavaScript.
4040
*/
4141
Function getAFunctionForEndpoint(DataFlow::Node endpoint) {
42+
// Performance optimization: Restrict the set of endpoints to the endpoints to featurize.
43+
endpoint = any(FeaturizationConfig cfg).getAnEndpointToFeaturize() and
4244
result = endpoint.getContainer().getEnclosingContainer*()
4345
}
4446

@@ -107,20 +109,38 @@ Function getRepresentativeFunctionForEndpoint(DataFlow::Node endpoint) {
107109
)
108110
}
109111

110-
/** Holds if `location` is the location of an AST node within the function `function` and `token` is a node attribute associated with that AST node. */
111-
predicate bodyTokens(Function function, Location location, string token) {
112+
/** Returns an AST node within the function `f` that an associated token feature. */
113+
ASTNode getAnASTNodeWithAFeature(Function f) {
112114
// Performance optimization: Restrict the set of functions to those containing an endpoint to featurize.
113-
function =
114-
getRepresentativeFunctionForEndpoint(any(FeaturizationConfig cfg).getAnEndpointToFeaturize()) and
115+
f = getRepresentativeFunctionForEndpoint(any(FeaturizationConfig cfg).getAnEndpointToFeaturize()) and
116+
result = getAnASTNodeToFeaturize(f)
117+
}
118+
119+
/** Holds if `location` is the location of an AST node within the function `function` and `token` is a node attribute associated with that AST node. */
120+
string getBodyTokensFeature(Function function) {
115121
// Performance optimization: If a function has more than 256 body subtokens, then featurize it as absent. This
116122
// approximates the behavior of the classifer on non-generic body features where large body
117123
// features are replaced by the absent token.
118124
//
119125
// We count nodes instead of tokens because tokens are often not unique.
120-
strictcount(getAnASTNodeToFeaturize(function)) <= 256 and
121-
exists(ASTNode node |
126+
strictcount(ASTNode node |
122127
node = getAnASTNodeToFeaturize(function) and
123-
token = getTokenizedAstNode(node) and
124-
location = node.getLocation()
125-
)
128+
exists(getTokenizedAstNode(node))
129+
) <= 256 and
130+
result =
131+
strictconcat(Location l, string token |
132+
// The use of a nested exists here allows us to avoid duplicates due to two AST nodes in the
133+
// same location featurizing to the same token. By using a nested exists, we take only unique
134+
// (location, token) pairs.
135+
exists(ASTNode node |
136+
node = getAnASTNodeToFeaturize(function) and
137+
token = getTokenizedAstNode(node) and
138+
l = node.getLocation()
139+
)
140+
|
141+
token, " "
142+
order by
143+
l.getFile().getAbsolutePath(), l.getStartLine(), l.getStartColumn(), l.getEndLine(),
144+
l.getEndColumn(), token
145+
)
126146
}

0 commit comments

Comments
 (0)