|
| 1 | +/* |
| 2 | + * FunctionBodyFeatures.qll |
| 3 | + * |
| 4 | + * Contains logic relating to the `enclosingFunctionBody` and `enclosingFunctionName` features. |
| 5 | + */ |
| 6 | + |
| 7 | +import javascript |
| 8 | +private import CodeToFeatures |
| 9 | +private import FeaturizationConfig |
| 10 | + |
/**
 * Gets the token that represents the AST node `node`, if it has one.
 *
 * Only identifiers, index expressions with a known property name, literals, and
 * template elements have a token; there is no result for any other kind of node.
 */
string getTokenizedAstNode(ASTNode node) {
  // NB: There is no case for unary and binary operator expressions (e.g. `-a`, `a + b`)
  // or compound assignments (e.g. `a += b`): those can be identified by the expression type.
  exists(Identifier id | id = node | result = id.getName())
  or
  // Computed property accesses for which the accessed property can be predetermined.
  // NB: This may alias with operators, e.g. `+` could be a property name.
  exists(IndexExpr index | index = node | result = index.getPropertyName())
  or
  // `getRawValue` keeps `0xa`, `0xA`, and `10` as distinct representations.
  exists(NumberLiteral number | number = node | result = number.getRawValue())
  or
  // For all other literals, `getValue` is used instead of `getRawValue` so that `"a"` and
  // `'a'` share the same representation.
  exists(Literal literal | literal = node and not literal instanceof NumberLiteral |
    result = literal.getValue()
  )
  or
  exists(TemplateElement element | element = node | result = element.getRawValue())
}
| 29 | + |
/**
 * Gets an AST node within the function `f` that we should featurize.
 *
 * A node qualifies if it is `f` or a transitive child of `f`, is not the node given by
 * `f.getIdentifier()`, and has a token according to `getTokenizedAstNode`.
 */
pragma[inline]
ASTNode getAnASTNodeToFeaturize(Function f) {
  // Only nodes with a token representation are featurized.
  exists(getTokenizedAstNode(result)) and
  // The node is `f` itself or is nested somewhere inside `f`.
  f = result.getParent*() and
  // Exclude the identifier of `f`. This is written as a negated equality so that it also
  // holds when `f.getIdentifier()` has no result.
  not f.getIdentifier() = result
}
| 37 | + |
/**
 * Gets a function containing the endpoint that is suitable for featurization.
 *
 * In general, this associates an endpoint with multiple functions: the result can be the
 * endpoint's own container or any container that transitively encloses it, provided that
 * container is a function.
 */
Function getAFunctionForEndpoint(DataFlow::Node endpoint) {
  endpoint.getContainer().getEnclosingContainer*() = result
}
| 45 | + |
/**
 * Gets the maximum number of AST nodes a function containing an endpoint should have before
 * we should choose a smaller function to represent the endpoint.
 *
 * This is intended to strike a balance in the amount of context we provide to the model:
 * we don't want the function to be too small, because then it contains little context and
 * misses useful information, but we also don't want it to be too large, because then it is
 * likely to contain a lot of irrelevant or only loosely related context.
 */
private int getMaxNumAstNodes() {
  result = 1024
}
| 56 | + |
/**
 * Gets the number of AST nodes to featurize (see `getAnASTNodeToFeaturize`) that are
 * contained within `function`.
 *
 * This is only defined for functions that contain some endpoint.
 */
private int getNumAstNodesInFunction(Function function) {
  // Restrict the values `function` can take on to functions that contain an endpoint.
  function = getAFunctionForEndpoint(_) and
  result = count(ASTNode node | node = getAnASTNodeToFeaturize(function))
}
| 65 | + |
/**
 * Gets the enclosing function that represents `endpoint`.
 *
 * This is used to compute the `enclosingFunctionBody` and `enclosingFunctionName` features.
 *
 * We try to use the largest function containing the endpoint whose node count is at most the
 * AST node limit defined in `getMaxNumAstNodes`. In the event of a tie, we use the function
 * that appears first within the source code.
 *
 * If no function containing the endpoint is within the AST node limit, then we use the
 * smallest function containing the endpoint, again preferring the one that appears first.
 */
Function getRepresentativeFunctionForEndpoint(DataFlow::Node endpoint) {
  // Check whether at least one function containing the endpoint is within the AST node limit.
  if
    exists(Function enclosing | enclosing = getAFunctionForEndpoint(endpoint) |
      getNumAstNodesInFunction(enclosing) <= getMaxNumAstNodes()
    )
  then
    // Among the functions within the limit, pick the largest one (descending node count),
    // breaking ties by source position.
    result =
      min(Function candidate, int size, Location loc |
        candidate = getAFunctionForEndpoint(endpoint) and
        size = getNumAstNodesInFunction(candidate) and
        size <= getMaxNumAstNodes() and
        loc = candidate.getLocation()
      |
        candidate
        order by
          size desc, loc.getStartLine(), loc.getStartColumn(), loc.getEndLine(),
          loc.getEndColumn()
      )
  else
    // Every enclosing function exceeds the limit: pick the smallest one (ascending node
    // count), breaking ties by source position.
    result =
      min(Function candidate, int size, Location loc |
        candidate = getAFunctionForEndpoint(endpoint) and
        size = getNumAstNodesInFunction(candidate) and
        loc = candidate.getLocation()
      |
        candidate
        order by
          size, loc.getStartLine(), loc.getStartColumn(), loc.getEndLine(), loc.getEndColumn()
      )
}
| 110 | + |
/**
 * Holds if `location` is the location of an AST node within the entity `entity` and `token`
 * is a node attribute associated with that AST node.
 *
 * This only holds for entities whose defined function is the representative function of an
 * endpoint to featurize and contains at most 256 AST nodes to featurize.
 */
predicate bodyTokens(DatabaseFeatures::Entity entity, Location location, string token) {
  // Performance optimization: restrict the set of entities to those whose function represents
  // an endpoint to featurize.
  getRepresentativeFunctionForEndpoint(any(FeaturizationConfig cfg).getAnEndpointToFeaturize()) =
    entity.getDefinedFunction() and
  // Performance optimization: if a function has more than 256 body subtokens, then featurize
  // it as absent. This approximates the behavior of the classifier on non-generic body
  // features, where large body features are replaced by the absent token.
  //
  // We count nodes instead of tokens because tokens are often not unique.
  strictcount(ASTNode counted | counted = getAnASTNodeToFeaturize(entity.getDefinedFunction())) <=
    256 and
  exists(ASTNode node | node = getAnASTNodeToFeaturize(entity.getDefinedFunction()) |
    location = node.getLocation() and
    token = getTokenizedAstNode(node)
  )
}
0 commit comments