Move function body features into their own file

henrymercer · henrymercer · commit 9e50ce873d5c · 2022-01-12T12:47:28.000Z
diff --git a/javascript/ql/experimental/adaptivethreatmodeling/lib/experimental/adaptivethreatmodeling/BaseScoring.qll b/javascript/ql/experimental/adaptivethreatmodeling/lib/experimental/adaptivethreatmodeling/BaseScoring.qll
@@ -14,22 +14,6 @@ external predicate availableMlModels(
 /** Get the ATM configuration. */
 ATMConfig getCfg() { any() }
 
-/**
- * This module provides functionality that takes an endpoint and provides an function that encloses
- * that endpoint.
- */
-module EndpointToFunction {
-  private import CodeToFeatures
-
-  /**
-   * Get a function containing the endpoint that is suitable for featurization. In general,
-   * this associates an endpoint to multiple functions, since there may be more than one multiple entities to a single endpoint.
-   */
-  Function getAFunctionForEndpoint(DataFlow::Node endpoint) {
-    result = endpoint.getContainer().getEnclosingContainer*()
-  }
-}
-
 /**
  * Scoring information produced by a scoring model.
  *
diff --git a/javascript/ql/experimental/adaptivethreatmodeling/lib/experimental/adaptivethreatmodeling/EndpointFeatures.qll b/javascript/ql/experimental/adaptivethreatmodeling/lib/experimental/adaptivethreatmodeling/EndpointFeatures.qll
@@ -5,21 +5,9 @@
  */
 
 import javascript
-import CodeToFeatures
-private import EndpointScoring
-
-/**
- * A configuration that defines which endpoints should be featurized.
- *
- * This is used as a performance optimization to ensure that we only featurize the endpoints we need
- * to featurize.
- */
-abstract class FeaturizationConfig extends string {
-  bindingset[this]
-  FeaturizationConfig() { any() }
-
-  abstract DataFlow::Node getAnEndpointToFeaturize();
-}
+private import CodeToFeatures
+private import FeaturizationConfig
+private import FunctionBodyFeatures as FunctionBodyFeatures
 
 /**
  * Gets the value of the token-based feature named `featureName` for the endpoint `endpoint`.
@@ -32,15 +20,24 @@ private string getTokenFeature(DataFlow::Node endpoint, string featureName) {
   (
     // Features for endpoints that are contained within a function.
     exists(DatabaseFeatures::Entity entity |
-      entity.getDefinedFunction() = getRepresentativeFunctionForEndpoint(endpoint)
+      entity.getDefinedFunction() =
+        FunctionBodyFeatures::getRepresentativeFunctionForEndpoint(endpoint)
     |
       // The name of the function that encloses the endpoint.
       featureName = "enclosingFunctionName" and result = entity.getName()
       or
       // A feature containing natural language tokens from the function that encloses the endpoint in
       // the order that they appear in the source code.
       featureName = "enclosingFunctionBody" and
-      result = unique(string x | x = FunctionBodies::getBodyTokenFeatureForEntity(entity))
+      result =
+        strictconcat(string token, Location l |
+          FunctionBodyFeatures::bodyTokens(entity, l, token)
+        |
+          token, " "
+          order by
+            l.getFile().getAbsolutePath(), l.getStartLine(), l.getStartColumn(), l.getEndLine(),
+            l.getEndColumn(), token
+        )
     )
     or
     result =
@@ -128,71 +125,6 @@ private string getACallBasedTokenFeatureComponent(
   )
 }
 
-/** This module provides functionality for getting the function body feature associated with a particular entity. */
-module FunctionBodies {
-  string getTokenizedAstNode(ASTNode node) {
-    // NB: Unary and binary operator expressions e.g. -a, a + b and compound
-    // assignments e.g. a += b can be identified by the expression type.
-    result = node.(Identifier).getName()
-    or
-    // Computed property accesses for which we can predetermine the property being accessed.
-    // NB: May alias with operators e.g. could have '+' as a property name.
-    result = node.(IndexExpr).getPropertyName()
-    or
-    // We use `getRawValue` to give us distinct representations for `0xa`, `0xA`, and `10`.
-    result = node.(NumberLiteral).getRawValue()
-    or
-    // We use `getValue` rather than `getRawValue` so we assign `"a"` and `'a'` the same representation.
-    not node instanceof NumberLiteral and
-    result = node.(Literal).getValue()
-    or
-    result = node.(TemplateElement).getRawValue()
-  }
-
-  /** Returns an AST node within the function `f` that we should featurize. */
-  pragma[inline]
-  ASTNode getAnASTNodeToFeaturize(Function f) {
-    result.getParent*() = f and
-    not result = f.getIdentifier() and
-    exists(getTokenizedAstNode(result))
-  }
-
-  /** Holds if `location` is the location of an AST node within the entity `entity` and `token` is a node attribute associated with that AST node. */
-  private predicate bodyTokens(DatabaseFeatures::Entity entity, Location location, string token) {
-    // Performance optimization: Restrict the set of entities to those containing an endpoint to featurize.
-    entity.getDefinedFunction() =
-      getRepresentativeFunctionForEndpoint(any(FeaturizationConfig cfg).getAnEndpointToFeaturize()) and
-    // Performance optimization: If a function has more than 256 body subtokens, then featurize it as absent. This
-    // approximates the behavior of the classifer on non-generic body features where large body
-    // features are replaced by the absent token.
-    //
-    // We count nodes instead of tokens because tokens are often not unique.
-    strictcount(getAnASTNodeToFeaturize(entity.getDefinedFunction())) <= 256 and
-    exists(ASTNode node |
-      node = getAnASTNodeToFeaturize(entity.getDefinedFunction()) and
-      token = getTokenizedAstNode(node) and
-      location = node.getLocation()
-    )
-  }
-
-  /**
-   * Gets the body token feature for the specified entity.
-   *
-   * This is a string containing natural language tokens in the order that they appear in the source code for the entity.
-   */
-  string getBodyTokenFeatureForEntity(DatabaseFeatures::Entity entity) {
-    result =
-      strictconcat(string token, Location l |
-        bodyTokens(entity, l, token)
-      |
-        token, " "
-        order by
-          l.getFile().getAbsolutePath(), l.getStartLine(), l.getStartColumn(), l.getEndLine(),
-          l.getEndColumn(), token
-      )
-  }
-}
-
 /**
  * This module provides functionality for getting a representation of the access path of nodes
  * within the program.
diff --git a/javascript/ql/experimental/adaptivethreatmodeling/lib/experimental/adaptivethreatmodeling/EndpointScoring.qll b/javascript/ql/experimental/adaptivethreatmodeling/lib/experimental/adaptivethreatmodeling/EndpointScoring.qll
@@ -5,88 +5,21 @@
  */
 
 private import javascript
-import BaseScoring
-import CodeToFeatures
-import EndpointFeatures as EndpointFeatures
-import EndpointTypes
+private import BaseScoring
+private import EndpointFeatures as EndpointFeatures
+private import FeaturizationConfig
+private import EndpointTypes
 
 private string getACompatibleModelChecksum() {
   availableMlModels(result, "javascript", _, "atm-endpoint-scoring")
 }
 
-/**
- * The maximum number of AST nodes an function containing an endpoint should have before we should
- * choose a smaller function to represent the endpoint.
- *
- * This is intended to represent a balance in terms of the amount of context we provide to the
- * model: we don't want the function to be too small, because then it doesn't contain very much
- * context and miss useful information, but also we don't want it to be too large, because then
- * there's likely to be a lot of irrelevant or very loosely related context.
- */
-private int getMaxNumAstNodes() { result = 1024 }
-
-/**
- * Returns the number of AST nodes contained within the specified function.
- */
-private int getNumAstNodesInFunction(Function function) {
-  // Restrict the values `function` can take on
-  function = EndpointToFunction::getAFunctionForEndpoint(_) and
-  result = count(EndpointFeatures::FunctionBodies::getAnASTNodeToFeaturize(function))
-}
-
-/**
- * Get the enclosing function for an endpoint.
- * 
- * This is used to compute the `enclosingFunctionBody` and `enclosingFunctionName` features.
- *
- * We try to use the largest function containing the endpoint that's below the AST node limit
- * defined in `getMaxNumAstNodes`. In the event of a tie, we use the function that appears first
- * within the source code.
- *
- * If no functions are smaller than the AST node limit, then we use the smallest function containing
- * the endpoint.
- */
-Function getRepresentativeFunctionForEndpoint(DataFlow::Node endpoint) {
-  // Check whether there's a function containing the endpoint that's smaller than the AST node
-  // limit.
-  if
-    getNumAstNodesInFunction(EndpointToFunction::getAFunctionForEndpoint(endpoint)) <=
-      getMaxNumAstNodes()
-  then
-    // Use the largest function smaller than the AST node limit, resolving ties using the function
-    // that appears first in the source code.
-    result =
-      min(Function function, int numAstNodes, Location l |
-        function = EndpointToFunction::getAFunctionForEndpoint(endpoint) and
-        numAstNodes = getNumAstNodesInFunction(function) and
-        numAstNodes <= getMaxNumAstNodes() and
-        l = function.getLocation()
-      |
-        function
-        order by
-          numAstNodes desc, l.getStartLine(), l.getStartColumn(), l.getEndLine(), l.getEndColumn()
-      )
-  else
-    // Use the smallest function, resolving ties using the function that appears first in the source
-    // code.
-    result =
-      min(Function function, int numAstNodes, Location l |
-        function = EndpointToFunction::getAFunctionForEndpoint(endpoint) and
-        numAstNodes = getNumAstNodesInFunction(function) and
-        l = function.getLocation()
-      |
-        function
-        order by
-          numAstNodes, l.getStartLine(), l.getStartColumn(), l.getEndLine(), l.getEndColumn()
-      )
-}
-
 module ModelScoring {
   /**
    * A featurization config that only featurizes new candidate endpoints that are part of a flow
    * path.
    */
-  class RelevantFeaturizationConfig extends EndpointFeatures::FeaturizationConfig {
+  class RelevantFeaturizationConfig extends FeaturizationConfig {
     RelevantFeaturizationConfig() { this = "RelevantFeaturization" }
 
     override DataFlow::Node getAnEndpointToFeaturize() {
@@ -97,7 +30,7 @@ module ModelScoring {
   }
 
   DataFlow::Node getARequestedEndpoint() {
-    result = any(EndpointFeatures::FeaturizationConfig cfg).getAnEndpointToFeaturize()
+    result = any(FeaturizationConfig cfg).getAnEndpointToFeaturize()
   }
 
   private int getARequestedEndpointType() { result = any(EndpointType type).getEncoding() }
diff --git a/javascript/ql/experimental/adaptivethreatmodeling/lib/experimental/adaptivethreatmodeling/FeaturizationConfig.qll b/javascript/ql/experimental/adaptivethreatmodeling/lib/experimental/adaptivethreatmodeling/FeaturizationConfig.qll
@@ -0,0 +1,14 @@
+import javascript
+
+/**
+ * A configuration that defines which endpoints should be featurized.
+ *
+ * This is used as a performance optimization to ensure that we only featurize the endpoints we need
+ * to featurize.
+ */
+abstract class FeaturizationConfig extends string {
+  bindingset[this]
+  FeaturizationConfig() { any() }
+
+  abstract DataFlow::Node getAnEndpointToFeaturize();
+}
diff --git a/javascript/ql/experimental/adaptivethreatmodeling/lib/experimental/adaptivethreatmodeling/FunctionBodyFeatures.qll b/javascript/ql/experimental/adaptivethreatmodeling/lib/experimental/adaptivethreatmodeling/FunctionBodyFeatures.qll
@@ -0,0 +1,127 @@
+/*
+ * FunctionBodyFeatures.qll
+ *
+ * Contains logic relating to the `enclosingFunctionBody` and `enclosingFunctionName` features.
+ */
+
+import javascript
+private import CodeToFeatures
+private import FeaturizationConfig
+
+string getTokenizedAstNode(ASTNode node) {
+  // NB: Unary and binary operator expressions e.g. -a, a + b and compound
+  // assignments e.g. a += b can be identified by the expression type.
+  result = node.(Identifier).getName()
+  or
+  // Computed property accesses for which we can predetermine the property being accessed.
+  // NB: May alias with operators e.g. could have '+' as a property name.
+  result = node.(IndexExpr).getPropertyName()
+  or
+  // We use `getRawValue` to give us distinct representations for `0xa`, `0xA`, and `10`.
+  result = node.(NumberLiteral).getRawValue()
+  or
+  // We use `getValue` rather than `getRawValue` so we assign `"a"` and `'a'` the same representation.
+  not node instanceof NumberLiteral and
+  result = node.(Literal).getValue()
+  or
+  result = node.(TemplateElement).getRawValue()
+}
+
+/** Returns an AST node within the function `f` that we should featurize. */
+pragma[inline]
+ASTNode getAnASTNodeToFeaturize(Function f) {
+  result.getParent*() = f and
+  not result = f.getIdentifier() and
+  exists(getTokenizedAstNode(result))
+}
+
+/**
+ * Get a function containing the endpoint that is suitable for featurization. In general,
+ * this associates an endpoint with multiple functions, since more than one function may enclose a single endpoint.
+ */
+Function getAFunctionForEndpoint(DataFlow::Node endpoint) {
+  result = endpoint.getContainer().getEnclosingContainer*()
+}
+
+/**
+ * The maximum number of AST nodes a function containing an endpoint should have before we should
+ * choose a smaller function to represent the endpoint.
+ *
+ * This is intended to represent a balance in terms of the amount of context we provide to the
+ * model: we don't want the function to be too small, because then it doesn't contain very much
+ * context and misses useful information, but also we don't want it to be too large, because then
+ * there's likely to be a lot of irrelevant or very loosely related context.
+ */
+private int getMaxNumAstNodes() { result = 1024 }
+
+/**
+ * Returns the number of AST nodes within the specified function that we should featurize.
+ */
+private int getNumAstNodesInFunction(Function function) {
+  // Restrict the values `function` can take on
+  function = getAFunctionForEndpoint(_) and
+  result = count(getAnASTNodeToFeaturize(function))
+}
+
+/**
+ * Get the enclosing function for an endpoint.
+ *
+ * This is used to compute the `enclosingFunctionBody` and `enclosingFunctionName` features.
+ *
+ * We try to use the largest function containing the endpoint that's within the AST node limit
+ * defined in `getMaxNumAstNodes`. In the event of a tie, we use the function that appears first
+ * within the source code.
+ *
+ * If no function is within the AST node limit, then we use the smallest function containing
+ * the endpoint.
+ */
+Function getRepresentativeFunctionForEndpoint(DataFlow::Node endpoint) {
+  // Check whether there's a function containing the endpoint that's smaller than the AST node
+  // limit.
+  if getNumAstNodesInFunction(getAFunctionForEndpoint(endpoint)) <= getMaxNumAstNodes()
+  then
+    // Use the largest function smaller than the AST node limit, resolving ties using the function
+    // that appears first in the source code.
+    result =
+      min(Function function, int numAstNodes, Location l |
+        function = getAFunctionForEndpoint(endpoint) and
+        numAstNodes = getNumAstNodesInFunction(function) and
+        numAstNodes <= getMaxNumAstNodes() and
+        l = function.getLocation()
+      |
+        function
+        order by
+          numAstNodes desc, l.getStartLine(), l.getStartColumn(), l.getEndLine(), l.getEndColumn()
+      )
+  else
+    // Use the smallest function, resolving ties using the function that appears first in the source
+    // code.
+    result =
+      min(Function function, int numAstNodes, Location l |
+        function = getAFunctionForEndpoint(endpoint) and
+        numAstNodes = getNumAstNodesInFunction(function) and
+        l = function.getLocation()
+      |
+        function
+        order by
+          numAstNodes, l.getStartLine(), l.getStartColumn(), l.getEndLine(), l.getEndColumn()
+      )
+}
+
+/** Holds if `location` is the location of an AST node within the entity `entity` and `token` is a node attribute associated with that AST node. */
+predicate bodyTokens(DatabaseFeatures::Entity entity, Location location, string token) {
+  // Performance optimization: Restrict the set of entities to those containing an endpoint to featurize.
+  entity.getDefinedFunction() =
+    getRepresentativeFunctionForEndpoint(any(FeaturizationConfig cfg).getAnEndpointToFeaturize()) and
+  // Performance optimization: If a function has more than 256 body subtokens, then featurize it as absent. This
+  // approximates the behavior of the classifier on non-generic body features where large body
+  // features are replaced by the absent token.
+  //
+  // We count nodes instead of tokens because tokens are often not unique.
+  strictcount(getAnASTNodeToFeaturize(entity.getDefinedFunction())) <= 256 and
+  exists(ASTNode node |
+    node = getAnASTNodeToFeaturize(entity.getDefinedFunction()) and
+    token = getTokenizedAstNode(node) and
+    location = node.getLocation()
+  )
+}