@@ -127,6 +127,7 @@ ASTNode getAnASTNodeWithAFeature(Function f) {
127
127
result = getAnASTNodeToFeaturize ( f )
128
128
}
129
129
130
+ /** Returns the number of source-code characters in a function. */
130
131
int getNumCharsInFunction ( Function f ) {
131
132
result =
132
133
strictsum ( ASTNode node | node = getAnASTNodeWithAFeature ( f ) | getTokenizedAstNode ( node ) .length ( ) )
@@ -135,10 +136,6 @@ int getNumCharsInFunction(Function f) {
135
136
// Evaluator string limit is 5395415 characters. We choose a limit lower than this.
136
137
// Gets the character-count ceiling (1000000) that `getNumCharsInFunction` results are compared against.
private int getMaxChars ( ) { result = 1000000 }
137
138
138
- Function getFeaturizableFunction ( Function f ) {
139
- result = f and getNumCharsInFunction ( f ) <= getMaxChars ( )
140
- }
141
-
142
139
/**
143
140
* Returns a featurized representation of the function that can be used to populate the
144
141
* `enclosingFunctionBody` feature for an endpoint.
@@ -147,13 +144,15 @@ string getBodyTokensFeature(Function function) {
147
144
// Performance optimization: If a function has more than 256 body subtokens, then featurize it as
148
145
// absent. This approximates the behavior of the classifier on non-generic body features where
149
146
// large body features are replaced by the absent token.
147
+ //
148
+ // We count nodes instead of tokens because tokens are often not unique.
150
149
strictcount ( ASTNode node |
151
150
node = getAnASTNodeToFeaturize ( function ) and
152
151
exists ( getTokenizedAstNode ( node ) )
153
152
) <= 256 and
154
153
// Performance optimization: If a function has more than getMaxChars() characters in its body subtokens,
155
154
// then featurize it as absent.
156
- function = getFeaturizableFunction ( function ) and
155
+ getNumCharsInFunction ( function ) <= getMaxChars ( ) and
157
156
result =
158
157
strictconcat ( Location l , string token |
159
158
// The use of a nested exists here allows us to avoid duplicates due to two AST nodes in the
0 commit comments