Skip to content

Commit 83ecc06

Browse files
committed
restrict size of strings
1 parent aceeb73 commit 83ecc06

File tree

1 file changed

+19
-14
lines changed

1 file changed

+19
-14
lines changed

javascript/ql/experimental/adaptivethreatmodeling/lib/experimental/adaptivethreatmodeling/FunctionBodyFeatures.qll

Lines changed: 19 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -38,9 +38,7 @@ pragma[inline]
3838
ASTNode getAnASTNodeToFeaturize(Function f) {
3939
result.getParent*() = f and
4040
// Don't featurize the function name as part of the function body tokens
41-
not result = f.getIdentifier() and
42-
// Don't include nodes whose string representation is too long (to avoid a catastrophic error caused by tokenized strings getting too long)
43-
result.toString().length() < 500000
41+
not result = f.getIdentifier()
4442
}
4543

4644
/**
@@ -129,22 +127,29 @@ ASTNode getAnASTNodeWithAFeature(Function f) {
129127
result = getAnASTNodeToFeaturize(f)
130128
}
131129

130+
int getNumCharsInFunction(Function f) {
131+
result = strictsum(int i |
132+
exists(ASTNode node | node = getAnASTNodeWithAFeature(f) and i = getTokenizedAstNode(node).length()) |
133+
i
134+
)
135+
}
136+
137+
// The evaluator's string limit is 5395415 characters, so we choose a limit lower than this.
138+
private int getMaxChars() { result = 1000000 }
139+
140+
Function getFeaturizableFunction(Function f) {
141+
result = f and getNumCharsInFunction(f) <= getMaxChars()
142+
}
143+
132144
/**
133145
* Returns a featurized representation of the function that can be used to populate the
134146
* `enclosingFunctionBody` feature for an endpoint.
135147
*/
136148
string getBodyTokensFeature(Function function) {
137-
// Performance optimization: If a function has more than 256 body subtokens, then featurize it as
138-
// absent. This approximates the behavior of the classifier on non-generic body features where
139-
// large body features are replaced by the absent token.
140-
//
141-
// We count nodes instead of tokens because tokens are often not unique.
142-
strictcount(ASTNode node |
143-
node = getAnASTNodeToFeaturize(function) and
144-
exists(getTokenizedAstNode(node))
145-
) <= 256 and
146-
result =
147-
strictconcat(Location l, string token |
149+
// Performance optimization: If a function has more than getMaxChars() characters in its body subtokens,
150+
// then featurize it as absent.
151+
function = getFeaturizableFunction(function) and
152+
result = strictconcat(Location l, string token |
148153
// The use of a nested exists here allows us to avoid duplicates due to two AST nodes in the
149154
// same location featurizing to the same token. By using a nested exists, we take only unique
150155
// (location, token) pairs.

0 commit comments

Comments
 (0)