|
| 1 | +/* |
| 2 | + * For internal use only. |
| 3 | + * |
| 4 | + * Extracts training data we can use to train ML models for ML-powered queries. |
| 5 | + */ |
| 6 | + |
| 7 | +import javascript |
| 8 | +import experimental.adaptivethreatmodeling.EndpointCharacteristics |
| 9 | +import experimental.adaptivethreatmodeling.EndpointFeatures as EndpointFeatures |
| 10 | +import NoFeaturizationRestrictionsConfig |
| 11 | +private import Exclusions as Exclusions |
| 12 | +private import Queries |
| 13 | +private import experimental.adaptivethreatmodeling.NosqlInjectionATM as NosqlInjectionAtm |
| 14 | +private import experimental.adaptivethreatmodeling.SqlInjectionATM as SqlInjectionAtm |
| 15 | +private import experimental.adaptivethreatmodeling.TaintedPathATM as TaintedPathAtm |
| 16 | +private import experimental.adaptivethreatmodeling.XssATM as XssAtm |
| 17 | + |
| 18 | +/** |
| 19 | + * Holds if `featureValue` is the value of token feature `featureName` for the training-set endpoint `endpoint`. |
| 20 | + * |
| 21 | + * `EndpointFeatures::tokenFeatures` yields no tuple when an endpoint lacks the feature `featureName`. |
| 22 | + * For compatibility with the data pipeline, this relation fills each such gap by pairing the |
| 23 | + * missing supported `featureName` with the empty string as `featureValue`. |
| 24 | + */ |
| 25 | +query predicate tokenFeatures(DataFlow::Node endpoint, string featureName, string featureValue) { |
| 26 | + trainingEndpoints(endpoint, _, _) and |
| 27 | + ( |
| 28 | + EndpointFeatures::tokenFeatures(endpoint, featureName, featureValue) |
| 29 | + or |
| 30 | + // Performance note: pairing every `endpoint` with every supported `featureName` is a Cartesian product. |
| 31 | + featureValue = "" and |
| 32 | + featureName = EndpointFeatures::getASupportedFeatureName() and |
| 33 | + not EndpointFeatures::tokenFeatures(endpoint, featureName, _) |
| 34 | + ) |
| 35 | +} |
| 36 | + |
| 37 | +/** |
| 38 | + * Holds if `endpoint` should be included in the training set as a sample labeled `endpointClass`, and `endpoint` has |
| 39 | + * the characteristic `characteristic`. Endpoint characteristics drive both selection and labeling: a label is assigned |
| 40 | + * when a characteristic of the endpoint implies it with at least that characteristic's high-confidence threshold. The |
| 41 | + * characteristics of each selected endpoint are also reported, for use by the modeling code. |
| 42 | + * |
| 43 | + * Params: |
| 44 | + * endpoint: The endpoint to include / exclude. |
| 45 | + * endpointClass: The sink type. Each EndpointType has a predicate getEncoding, which specifies the classifier class |
| 46 | + * for this sink type. Class 0 is the negative class (non-sink). Each positive int corresponds to a single sink type. |
| 47 | + * This gives us the label for the endpoint in the training data. |
| 48 | + * characteristic: Any characteristic that applies to the endpoint (not necessarily the one that determined the label). |
| 49 | + * The modeling code currently uses these for type balancing. |
| 50 | + * |
| 51 | + * Note: This predicate produces one tuple per characteristic of an endpoint, so consumers must group the tuples of an |
| 52 | + * endpoint together to obtain its list of characteristics. |
| 53 | + */ |
| 54 | +query predicate trainingEndpoints( |
| 55 | + DataFlow::Node endpoint, EndpointType endpointClass, EndpointCharacteristic characteristic |
| 56 | +) { |
| 57 | + characteristic.getEndpoints(endpoint) and |
| 58 | + // Only consider the source code for the project being analyzed, i.e. files that have a relative path. |
| 59 | + exists(endpoint.getFile().getRelativePath()) and |
| 60 | + // Only select endpoints that can be part of a tainted flow: Constant expressions always evaluate to a constant |
| 61 | + // primitive value. Therefore they can't ever appear in an alert, making them less interesting training examples. |
| 62 | + // TODO: Experiment with removing this requirement. |
| 63 | + not endpoint.asExpr() instanceof ConstantExpr and |
| 64 | + // Do not select endpoints in files that are excluded from the end-to-end evaluation. |
| 65 | + // TODO: Experiment with removing this requirement. |
| 66 | + not Exclusions::isFileExcluded(endpoint.getFile()) and |
| 67 | + // Filter out negative examples that also have a `LikelyNotASinkCharacteristic`, because this is currently done here |
| 68 | + // https://github.com/github/codeql/blob/387e57546bf7352f7c1cfe781daa1a3799b7063e/javascript/ql/experimental/adaptivethreatmodeling/modelbuilding/extraction/ExtractEndpointData.qll#L77 |
| 69 | + // TODO: Experiment with removing this requirement. |
| 70 | + not ( |
| 71 | + endpointClass instanceof NegativeType and |
| 72 | + exists(EndpointCharacteristic c | |
| 73 | + c.getEndpoints(endpoint) and |
| 74 | + c instanceof LikelyNotASinkCharacteristic |
| 75 | + ) |
| 76 | + ) and |
| 77 | + ( |
| 78 | + // Case 1: some characteristic of the endpoint is a high-confidence positive indicator for `endpointClass`, so |
| 79 | + // select the endpoint as a training sample belonging to that class. |
| 80 | + exists(EndpointCharacteristic characteristic2, float confidence | |
| 81 | + characteristic2.getEndpoints(endpoint) and |
| 82 | + characteristic2.getImplications(endpointClass, true, confidence) and |
| 83 | + confidence >= characteristic2.getHighConfidenceThreshold() |
| 84 | + ) and |
| 85 | + ( |
| 86 | + // Positive classes need nothing further. For the negative class, additionally require that the endpoint has no |
| 87 | + // high-confidence indicator of being a sink of any positive class, because this is what was previously done. |
| 88 | + // TODO: Experiment with removing this requirement, and instead ensuring that an endpoint never has both a high |
| 89 | + // confidence indicator that it _is_ a sink and a high confidence indicator that it is _not_ a sink. |
| 90 | + not endpointClass instanceof NegativeType |
| 91 | + or |
| 92 | + not exists(EndpointCharacteristic characteristic3, float confidence3, EndpointType posClass | |
| 93 | + characteristic3.getEndpoints(endpoint) and |
| 94 | + characteristic3.getImplications(posClass, true, confidence3) and |
| 95 | + confidence3 >= characteristic3.getHighConfidenceThreshold() and |
| 96 | + not posClass instanceof NegativeType |
| 97 | + ) |
| 98 | + ) |
| 99 | + or |
| 100 | + // Case 2: for every class other than the negative class, some characteristic is a high-confidence negative |
| 101 | + // indicator, so select this as a training sample of class 0 (this means we had query-specific characteristics to |
| 102 | + // decide this endpoint isn't a sink for each of our sink types). |
| 103 | + endpointClass instanceof NegativeType and |
| 104 | + forall(EndpointType otherClass | not otherClass instanceof NegativeType | |
| 105 | + exists(EndpointCharacteristic characteristic2, float confidence | |
| 106 | + characteristic2.getEndpoints(endpoint) and |
| 107 | + characteristic2.getImplications(otherClass, false, confidence) and |
| 108 | + confidence >= characteristic2.getHighConfidenceThreshold() |
| 109 | + ) |
| 110 | + ) |
| 111 | + ) |
| 112 | +} |
| 113 | + |
| 114 | +/** |
| 115 | + * Temporary: |
| 116 | + * Reformats the training data selected by `trainingEndpoints` to match the format produced by the old predicate, i.e. |
| 117 | + * (`queryName`, `key`, `value`, `valueType`) rows per endpoint. This is the format expected by the endpoint pipeline. |
| 118 | + */ |
| 119 | +query predicate reformattedTrainingEndpoints( |
| 120 | + DataFlow::Node endpoint, string queryName, string key, string value, string valueType |
| 121 | +) { |
| 122 | + trainingEndpoints(endpoint, _, _) and |
| 123 | + exists(Query query | |
| 124 | + queryName = query.getName() and |
| 125 | + // Sinks are listed only for the query whose name prefixes their sink type's description; non-sinks for all queries. |
| 126 | + ( |
| 127 | + exists(EndpointType endpointClass | |
| 128 | + endpointClass.getDescription().matches(queryName + "%") and |
| 129 | + not endpointClass instanceof NegativeType and |
| 130 | + trainingEndpoints(endpoint, endpointClass, _) |
| 131 | + ) |
| 132 | + or |
| 133 | + exists(EndpointType endpointClass | |
| 134 | + endpointClass instanceof NegativeType and |
| 135 | + trainingEndpoints(endpoint, endpointClass, _) |
| 136 | + ) |
| 137 | + ) and |
| 138 | + ( |
| 139 | + // NOTE: We don't use hasFlowFromSource in training, so we could just hardcode it to be false. |
| 140 | + key = "hasFlowFromSource" and |
| 141 | + ( |
| 142 | + if FlowFromSource::hasFlowFromSource(endpoint, query) |
| 143 | + then value = "true" |
| 144 | + else value = "false" |
| 145 | + ) and |
| 146 | + valueType = "boolean" |
| 147 | + or |
| 148 | + // Whether the endpoint is a constant expression. `trainingEndpoints` already rejects constant expressions, so |
| 149 | + // this is "false" for every row emitted here; presumably the key is kept only for format compatibility. |
| 150 | + key = "isConstantExpression" and |
| 151 | + (if endpoint.asExpr() instanceof ConstantExpr then value = "true" else value = "false") and |
| 152 | + valueType = "boolean" |
| 153 | + or |
| 154 | + // Whether the endpoint's file is excluded from the end-to-end evaluation; "false" here for the same reason. |
| 155 | + key = "isExcludedFromEndToEndEvaluation" and |
| 156 | + (if Exclusions::isFileExcluded(endpoint.getFile()) then value = "true" else value = "false") and |
| 157 | + valueType = "boolean" |
| 158 | + or |
| 159 | + // Label "Sink": the endpoint is a positive training example for the sink type matching this query. |
| 160 | + key = "sinkLabel" and |
| 161 | + valueType = "string" and |
| 162 | + value = "Sink" and |
| 163 | + exists(EndpointType endpointClass | |
| 164 | + endpointClass.getDescription().matches(queryName + "%") and |
| 165 | + not endpointClass instanceof NegativeType and |
| 166 | + trainingEndpoints(endpoint, endpointClass, _) |
| 167 | + ) |
| 168 | + or |
| 169 | + key = "sinkLabel" and |
| 170 | + valueType = "string" and |
| 171 | + value = "NotASink" and |
| 172 | + exists(EndpointType endpointClass | |
| 173 | + endpointClass instanceof NegativeType and |
| 174 | + trainingEndpoints(endpoint, endpointClass, _) |
| 175 | + ) |
| 176 | + or |
| 177 | + // The reason(s) for a NotASink label: each characteristic of the endpoint implying the negative class, at any confidence. |
| 178 | + key = "notASinkReason" and |
| 179 | + exists(EndpointCharacteristic characteristic, EndpointType endpointClass | |
| 180 | + characteristic.getEndpoints(endpoint) and |
| 181 | + characteristic.getImplications(endpointClass, true, _) and |
| 182 | + endpointClass instanceof NegativeType and |
| 183 | + value = characteristic |
| 184 | + ) and |
| 185 | + // Don't include a notASinkReason for endpoints that have a high-confidence indicator of being a sink. |
| 186 | + not exists(EndpointCharacteristic characteristic3, float confidence3, EndpointType posClass | |
| 187 | + characteristic3.getEndpoints(endpoint) and |
| 188 | + characteristic3.getImplications(posClass, true, confidence3) and |
| 189 | + confidence3 >= characteristic3.getHighConfidenceThreshold() and |
| 190 | + not posClass instanceof NegativeType |
| 191 | + ) and |
| 192 | + valueType = "string" |
| 193 | + ) |
| 194 | + ) |
| 195 | +} |
| 196 | + |
| 197 | +/** |
| 198 | + * Gets an ATM data flow configuration matching `query`, for each of the four query kinds handled below. |
| 199 | + * TODO: Remove this together with the `hasFlowFromSource` output once that is no longer surfaced. |
| 200 | + */ |
| 201 | +DataFlow::Configuration getDataFlowCfg(Query query) { |
| 202 | + result instanceof XssAtm::Configuration and query instanceof XssQuery |
| 203 | + or |
| 204 | + result instanceof TaintedPathAtm::Configuration and query instanceof TaintedPathQuery |
| 205 | + or |
| 206 | + result instanceof SqlInjectionAtm::Configuration and query instanceof SqlInjectionQuery |
| 207 | + or |
| 208 | + result instanceof NosqlInjectionAtm::Configuration and query instanceof NosqlInjectionQuery |
| 209 | +} |
| 210 | + |
| 211 | +// Computes flow from known sources to endpoints. TODO: Delete once `hasFlowFromSource` is no longer surfaced. |
| 212 | +module FlowFromSource { |
| 213 | + predicate hasFlowFromSource(DataFlow::Node endpoint, Query q) { |
| 214 | + any(Configuration config | config.getQuery() = q).hasFlow(_, endpoint) |
| 215 | + } |
| 216 | + |
| 217 | + /** |
| 218 | + * A data flow configuration that replicates the data flow configuration for a specific query, but |
| 219 | + * accepts every node as a sink instead of only that query's own sinks. |
| 220 | + * |
| 221 | + * This lets `hasFlowFromSource` ask whether a known source flows to any particular endpoint. |
| 222 | + * |
| 223 | + * This configuration behaves in a very similar way to the `ForwardExploringConfiguration` class |
| 224 | + * from the CodeQL standard libraries for JavaScript. |
| 225 | + */ |
| 226 | + private class Configuration extends DataFlow::Configuration { |
| 227 | + Query q; |
| 228 | + |
| 229 | + Configuration() { this = getDataFlowCfg(q) } |
| 230 | + |
| 231 | + Query getQuery() { result = q } |
| 232 | + |
| 233 | + /** Holds for every node: any node may be an endpoint, so none is ruled out as a sink. */ |
| 234 | + override predicate isSink(DataFlow::Node sink) { any() } |
| 235 | + |
| 236 | + /** Holds for every node and every flow label `lbl`, for the same reason as above. */ |
| 237 | + override predicate isSink(DataFlow::Node sink, DataFlow::FlowLabel lbl) { exists(lbl) } |
| 238 | + } |
| 239 | +} |
0 commit comments