Skip to content

Commit 209f3e2

Browse files
authored
Merge pull request github#13239 from github/tausbn/automodel-application-mode
Java: Add QL support for automodel application mode
2 parents 1a4fca3 + b38bc52 commit 209f3e2

11 files changed

+779
-140
lines changed

java/ql/src/Telemetry/AutomodelApplicationModeCharacteristics.qll

Lines changed: 444 additions & 0 deletions
Large diffs are not rendered by default.
Lines changed: 49 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,49 @@
1+
/**
2+
* Surfaces the endpoints that are not already known to be sinks, and are therefore used as candidates for
3+
* classification with an ML model.
4+
*
5+
* Note: This query does not actually classify the endpoints using the model.
6+
*
7+
* @name Automodel candidates (application mode)
8+
* @description A query to extract automodel candidates in application mode.
9+
* @kind problem
10+
* @problem.severity recommendation
11+
* @id java/ml/extract-automodel-application-candidates
12+
* @tags internal extract automodel application-mode candidates
13+
*/
14+
15+
private import AutomodelApplicationModeCharacteristics
16+
private import AutomodelJavaUtil
17+
18+
from
19+
Endpoint endpoint, string message, ApplicationModeMetadataExtractor meta, DollarAtString package,
20+
DollarAtString type, DollarAtString subtypes, DollarAtString name, DollarAtString signature,
21+
DollarAtString input
22+
where
23+
not exists(CharacteristicsImpl::UninterestingToModelCharacteristic u |
24+
u.appliesToEndpoint(endpoint)
25+
) and
26+
// If a node is already a known sink for any of our existing ATM queries and is already modeled as a MaD sink, we
27+
// don't include it as a candidate. Otherwise, we might include it as a candidate for query A, but the model will
28+
// label it as a sink for one of the sink types of query B, for which it's already a known sink. This would result in
29+
// overlap between our detected sinks and the pre-existing modeling. We assume that, if a sink has already been
30+
// modeled in a MaD model, then it doesn't belong to any additional sink types, and we don't need to reexamine it.
31+
not CharacteristicsImpl::isSink(endpoint, _) and
32+
meta.hasMetadata(endpoint, package, type, subtypes, name, signature, input) and
33+
// The message is the concatenation of all sink types for which this endpoint is known neither to be a sink nor to be
34+
// a non-sink, and we surface only endpoints that have at least one such sink type.
35+
message =
36+
strictconcat(AutomodelEndpointTypes::SinkType sinkType |
37+
not CharacteristicsImpl::isKnownSink(endpoint, sinkType) and
38+
CharacteristicsImpl::isSinkCandidate(endpoint, sinkType)
39+
|
40+
sinkType, ", "
41+
)
42+
select endpoint, message + "\nrelated locations: $@." + "\nmetadata: $@, $@, $@, $@, $@, $@.", //
43+
CharacteristicsImpl::getRelatedLocationOrCandidate(endpoint, CallContext()), "CallContext", //
44+
package, "package", //
45+
type, "type", //
46+
subtypes, "subtypes", //
47+
name, "name", // method name
48+
signature, "signature", //
49+
input, "input" //
Lines changed: 73 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,73 @@
1+
/**
2+
* Surfaces endpoints that are non-sinks with high confidence, for use as negative examples in the prompt.
3+
*
4+
* @name Negative examples (application mode)
5+
* @kind problem
6+
* @problem.severity recommendation
7+
* @id java/ml/extract-automodel-application-negative-examples
8+
* @tags internal extract automodel application-mode negative examples
9+
*/
10+
11+
private import java
12+
private import AutomodelApplicationModeCharacteristics
13+
private import AutomodelEndpointTypes
14+
private import AutomodelJavaUtil
15+
16+
/**
17+
* Gets a sample of endpoints (of at most `limit` samples) for which the given characteristic applies.
18+
*
19+
* The main purpose of this helper predicate is to avoid selecting too many samples, as this may
20+
* cause the SARIF file to exceed the maximum size limit.
21+
*/
22+
bindingset[limit]
23+
Endpoint getSampleForCharacteristic(EndpointCharacteristic c, int limit) {
24+
exists(int n, int num_endpoints | num_endpoints = count(Endpoint e | c.appliesToEndpoint(e)) |
25+
result =
26+
rank[n](Endpoint e, Location loc |
27+
loc = e.getLocation() and c.appliesToEndpoint(e)
28+
|
29+
e
30+
order by
31+
loc.getFile().getAbsolutePath(), loc.getStartLine(), loc.getStartColumn(),
32+
loc.getEndLine(), loc.getEndColumn()
33+
) and
34+
// To avoid selecting samples that are too close together (as the ranking above goes by file
35+
// path first), we select `limit` evenly spaced samples from the ranked list of endpoints. By
36+
// default this would always include the first sample, so we add a random-chosen prime offset
37+
// to the first sample index, and reduce modulo the number of endpoints.
38+
// Finally, we add 1 to the result, as ranking results in a 1-indexed relation.
39+
n = 1 + (([0 .. limit - 1] * (num_endpoints / limit).floor() + 46337) % num_endpoints)
40+
)
41+
}
42+
43+
from
44+
Endpoint endpoint, EndpointCharacteristic characteristic, float confidence, string message,
45+
ApplicationModeMetadataExtractor meta, DollarAtString package, DollarAtString type,
46+
DollarAtString subtypes, DollarAtString name, DollarAtString signature, DollarAtString input
47+
where
48+
endpoint = getSampleForCharacteristic(characteristic, 100) and
49+
confidence >= SharedCharacteristics::highConfidence() and
50+
characteristic.hasImplications(any(NegativeSinkType negative), true, confidence) and
51+
// Exclude endpoints that have contradictory endpoint characteristics, because we only want examples we're highly
52+
// certain about in the prompt.
53+
not erroneousEndpoints(endpoint, _, _, _, _, false) and
54+
meta.hasMetadata(endpoint, package, type, subtypes, name, signature, input) and
55+
// It's valid for a node to satisfy the logic for both `isSink` and `isSanitizer`, but in that case it will be
56+
// treated by the actual query as a sanitizer, since the final logic is something like
57+
// `isSink(n) and not isSanitizer(n)`. We don't want to include such nodes as negative examples in the prompt, because
58+
// they're ambiguous and might confuse the model, so we explicitly exclude all known sinks from the negative examples.
59+
not exists(EndpointCharacteristic characteristic2, float confidence2, SinkType positiveType |
60+
not positiveType instanceof NegativeSinkType and
61+
characteristic2.appliesToEndpoint(endpoint) and
62+
confidence2 >= SharedCharacteristics::maximalConfidence() and
63+
characteristic2.hasImplications(positiveType, true, confidence2)
64+
) and
65+
message = characteristic
66+
select endpoint, message + "\nrelated locations: $@." + "\nmetadata: $@, $@, $@, $@, $@, $@.", //
67+
CharacteristicsImpl::getRelatedLocationOrCandidate(endpoint, CallContext()), "CallContext", //
68+
package, "package", //
69+
type, "type", //
70+
subtypes, "subtypes", //
71+
name, "name", //
72+
signature, "signature", //
73+
input, "input" //
Lines changed: 33 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,33 @@
1+
/**
2+
* Surfaces endpoints that are sinks with high confidence, for use as positive examples in the prompt.
3+
*
4+
* @name Positive examples (application mode)
5+
* @kind problem
6+
* @problem.severity recommendation
7+
* @id java/ml/extract-automodel-application-positive-examples
8+
* @tags internal extract automodel application-mode positive examples
9+
*/
10+
11+
private import AutomodelApplicationModeCharacteristics
12+
private import AutomodelEndpointTypes
13+
private import AutomodelJavaUtil
14+
15+
from
16+
Endpoint endpoint, SinkType sinkType, ApplicationModeMetadataExtractor meta,
17+
DollarAtString package, DollarAtString type, DollarAtString subtypes, DollarAtString name,
18+
DollarAtString signature, DollarAtString input
19+
where
20+
// Exclude endpoints that have contradictory endpoint characteristics, because we only want examples we're highly
21+
// certain about in the prompt.
22+
not erroneousEndpoints(endpoint, _, _, _, _, false) and
23+
meta.hasMetadata(endpoint, package, type, subtypes, name, signature, input) and
24+
// Extract positive examples of sinks belonging to the existing ATM query configurations.
25+
CharacteristicsImpl::isKnownSink(endpoint, sinkType)
26+
select endpoint, sinkType + "\nrelated locations: $@." + "\nmetadata: $@, $@, $@, $@, $@, $@.", //
27+
CharacteristicsImpl::getRelatedLocationOrCandidate(endpoint, CallContext()), "CallContext", //
28+
package, "package", //
29+
type, "type", //
30+
subtypes, "subtypes", //
31+
name, "name", //
32+
signature, "signature", //
33+
input, "input" //

java/ql/src/Telemetry/AutomodelFrameworkModeCharacteristics.qll

Lines changed: 31 additions & 75 deletions
Original file line numberDiff line numberDiff line change
@@ -14,23 +14,11 @@ private import semmle.code.java.Expr as Expr
1414
private import semmle.code.java.security.QueryInjection
1515
private import semmle.code.java.security.RequestForgery
1616
private import semmle.code.java.dataflow.internal.ModelExclusions as ModelExclusions
17+
private import AutomodelJavaUtil as AutomodelJavaUtil
18+
private import AutomodelSharedGetCallable as AutomodelSharedGetCallable
1719
import AutomodelSharedCharacteristics as SharedCharacteristics
1820
import AutomodelEndpointTypes as AutomodelEndpointTypes
1921

20-
/**
21-
* A meta data extractor. Any Java extraction mode needs to implement exactly
22-
* one instance of this class.
23-
*/
24-
abstract class MetadataExtractor extends string {
25-
bindingset[this]
26-
MetadataExtractor() { any() }
27-
28-
abstract predicate hasMetadata(
29-
DataFlow::ParameterNode e, string package, string type, boolean subtypes, string name,
30-
string signature, int input, string parameterName
31-
);
32-
}
33-
3422
newtype JavaRelatedLocationType =
3523
MethodDoc() or
3624
ClassDoc()
@@ -60,31 +48,7 @@ module FrameworkCandidatesImpl implements SharedCharacteristics::CandidateSig {
6048

6149
RelatedLocation asLocation(Endpoint e) { result = e.asParameter() }
6250

63-
predicate isKnownKind(string kind, string humanReadableKind, EndpointType type) {
64-
kind = "read-file" and
65-
humanReadableKind = "read file" and
66-
type instanceof AutomodelEndpointTypes::TaintedPathSinkType
67-
or
68-
kind = "create-file" and
69-
humanReadableKind = "create file" and
70-
type instanceof AutomodelEndpointTypes::TaintedPathSinkType
71-
or
72-
kind = "sql" and
73-
humanReadableKind = "mad modeled sql" and
74-
type instanceof AutomodelEndpointTypes::SqlSinkType
75-
or
76-
kind = "open-url" and
77-
humanReadableKind = "open url" and
78-
type instanceof AutomodelEndpointTypes::RequestForgerySinkType
79-
or
80-
kind = "jdbc-url" and
81-
humanReadableKind = "jdbc url" and
82-
type instanceof AutomodelEndpointTypes::RequestForgerySinkType
83-
or
84-
kind = "command-injection" and
85-
humanReadableKind = "command injection" and
86-
type instanceof AutomodelEndpointTypes::CommandInjectionSinkType
87-
}
51+
predicate isKnownKind = AutomodelJavaUtil::isKnownKind/3;
8852

8953
predicate isSink(Endpoint e, string kind) {
9054
exists(string package, string type, string name, string signature, string ext, string input |
@@ -103,33 +67,41 @@ module FrameworkCandidatesImpl implements SharedCharacteristics::CandidateSig {
10367
additional predicate sinkSpec(
10468
Endpoint e, string package, string type, string name, string signature, string ext, string input
10569
) {
106-
FrameworkCandidatesImpl::getCallable(e).hasQualifiedName(package, type, name) and
107-
signature = ExternalFlow::paramsString(getCallable(e)) and
70+
FrameworkModeGetCallable::getCallable(e).hasQualifiedName(package, type, name) and
71+
signature = ExternalFlow::paramsString(FrameworkModeGetCallable::getCallable(e)) and
10872
ext = "" and
10973
exists(int paramIdx | e.isParameterOf(_, paramIdx) |
110-
if paramIdx = -1 then input = "Argument[this]" else input = "Argument[" + paramIdx + "]"
74+
input = AutomodelJavaUtil::getArgumentForIndex(paramIdx)
11175
)
11276
}
11377

11478
/**
115-
* Returns the related location for the given endpoint.
79+
* Gets the related location for the given endpoint.
11680
*
11781
* Related locations can be JavaDoc comments of the class or the method.
11882
*/
11983
RelatedLocation getRelatedLocation(Endpoint e, RelatedLocationType type) {
12084
type = MethodDoc() and
121-
result = FrameworkCandidatesImpl::getCallable(e).(Documentable).getJavadoc()
85+
result = FrameworkModeGetCallable::getCallable(e).(Documentable).getJavadoc()
12286
or
12387
type = ClassDoc() and
124-
result = FrameworkCandidatesImpl::getCallable(e).getDeclaringType().(Documentable).getJavadoc()
88+
result = FrameworkModeGetCallable::getCallable(e).getDeclaringType().(Documentable).getJavadoc()
12589
}
90+
}
91+
92+
private class JavaCallable = Callable;
93+
94+
private module FrameworkModeGetCallable implements AutomodelSharedGetCallable::GetCallableSig {
95+
class Callable = JavaCallable;
96+
97+
class Endpoint = FrameworkCandidatesImpl::Endpoint;
12698

12799
/**
128100
* Gets the callable that contains the given endpoint.
129101
*
130102
* Each Java mode should implement this predicate.
131103
*/
132-
additional Callable getCallable(Endpoint e) { result = e.getEnclosingCallable() }
104+
Callable getCallable(Endpoint e) { result = e.getEnclosingCallable() }
133105
}
134106

135107
module CharacteristicsImpl = SharedCharacteristics::SharedCharacteristics<FrameworkCandidatesImpl>;
@@ -145,35 +117,19 @@ class Endpoint = FrameworkCandidatesImpl::Endpoint;
145117
/**
146118
* A MetadataExtractor that extracts metadata for framework mode.
147119
*/
148-
class FrameworkModeMetadataExtractor extends MetadataExtractor {
120+
class FrameworkModeMetadataExtractor extends string {
149121
FrameworkModeMetadataExtractor() { this = "FrameworkModeMetadataExtractor" }
150122

151-
/**
152-
* By convention, the subtypes property of the MaD declaration should only be
153-
* true when there _can_ exist any subtypes with a different implementation.
154-
*
155-
* It would technically be ok to always use the value 'true', but this would
156-
* break convention.
157-
*/
158-
boolean considerSubtypes(Callable callable) {
159-
if
160-
callable.isStatic() or
161-
callable.getDeclaringType().isStatic() or
162-
callable.isFinal() or
163-
callable.getDeclaringType().isFinal()
164-
then result = false
165-
else result = true
166-
}
167-
168-
override predicate hasMetadata(
169-
Endpoint e, string package, string type, boolean subtypes, string name, string signature,
170-
int input, string parameterName
123+
predicate hasMetadata(
124+
Endpoint e, string package, string type, string subtypes, string name, string signature,
125+
string input, string parameterName
171126
) {
172-
exists(Callable callable |
173-
e.asParameter() = callable.getParameter(input) and
127+
exists(Callable callable, int paramIdx |
128+
e.asParameter() = callable.getParameter(paramIdx) and
129+
input = AutomodelJavaUtil::getArgumentForIndex(paramIdx) and
174130
package = callable.getDeclaringType().getPackage().getName() and
175131
type = callable.getDeclaringType().getErasure().(RefType).nestedName() and
176-
subtypes = this.considerSubtypes(callable) and
132+
subtypes = AutomodelJavaUtil::considerSubtypes(callable).toString() and
177133
name = callable.getName() and
178134
parameterName = e.asParameter().getName() and
179135
signature = ExternalFlow::paramsString(callable)
@@ -199,8 +155,8 @@ private class UnexploitableIsCharacteristic extends CharacteristicsImpl::NotASin
199155

200156
override predicate appliesToEndpoint(Endpoint e) {
201157
not FrameworkCandidatesImpl::isSink(e, _) and
202-
FrameworkCandidatesImpl::getCallable(e).getName().matches("is%") and
203-
FrameworkCandidatesImpl::getCallable(e).getReturnType() instanceof BooleanType
158+
FrameworkModeGetCallable::getCallable(e).getName().matches("is%") and
159+
FrameworkModeGetCallable::getCallable(e).getReturnType() instanceof BooleanType
204160
}
205161
}
206162

@@ -218,7 +174,7 @@ private class UnexploitableExistsCharacteristic extends CharacteristicsImpl::Not
218174
override predicate appliesToEndpoint(Endpoint e) {
219175
not FrameworkCandidatesImpl::isSink(e, _) and
220176
exists(Callable callable |
221-
callable = FrameworkCandidatesImpl::getCallable(e) and
177+
callable = FrameworkModeGetCallable::getCallable(e) and
222178
callable.getName().toLowerCase() = ["exists", "notexists"] and
223179
callable.getReturnType() instanceof BooleanType
224180
)
@@ -232,7 +188,7 @@ private class ExceptionCharacteristic extends CharacteristicsImpl::NotASinkChara
232188
ExceptionCharacteristic() { this = "exception" }
233189

234190
override predicate appliesToEndpoint(Endpoint e) {
235-
FrameworkCandidatesImpl::getCallable(e).getDeclaringType().getASupertype*() instanceof
191+
FrameworkModeGetCallable::getCallable(e).getDeclaringType().getASupertype*() instanceof
236192
TypeThrowable
237193
}
238194
}
@@ -258,7 +214,7 @@ private class NonPublicMethodCharacteristic extends CharacteristicsImpl::Uninter
258214
NonPublicMethodCharacteristic() { this = "non-public method" }
259215

260216
override predicate appliesToEndpoint(Endpoint e) {
261-
not FrameworkCandidatesImpl::getCallable(e).isPublic()
217+
not FrameworkModeGetCallable::getCallable(e).isPublic()
262218
}
263219
}
264220

java/ql/src/Telemetry/AutomodelFrameworkModeExtractCandidates.ql

Lines changed: 16 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -4,20 +4,21 @@
44
*
55
* Note: This query does not actually classify the endpoints using the model.
66
*
7-
* @name Automodel candidates
8-
* @description A query to extract automodel candidates.
7+
* @name Automodel candidates (framework mode)
8+
* @description A query to extract automodel candidates in framework mode.
99
* @kind problem
10-
* @severity info
11-
* @id java/ml/extract-automodel-candidates
12-
* @tags internal automodel extract candidates
10+
* @problem.severity recommendation
11+
* @id java/ml/extract-automodel-framework-candidates
12+
* @tags internal extract automodel framework-mode candidates
1313
*/
1414

1515
private import AutomodelFrameworkModeCharacteristics
16-
private import AutomodelSharedUtil
16+
private import AutomodelJavaUtil
1717

1818
from
19-
Endpoint endpoint, string message, MetadataExtractor meta, string package, string type,
20-
boolean subtypes, string name, string signature, int input, string parameterName
19+
Endpoint endpoint, string message, FrameworkModeMetadataExtractor meta, DollarAtString package,
20+
DollarAtString type, DollarAtString subtypes, DollarAtString name, DollarAtString signature,
21+
DollarAtString input, DollarAtString parameterName
2122
where
2223
not exists(CharacteristicsImpl::UninterestingToModelCharacteristic u |
2324
u.appliesToEndpoint(endpoint)
@@ -42,10 +43,10 @@ select endpoint,
4243
message + "\nrelated locations: $@, $@." + "\nmetadata: $@, $@, $@, $@, $@, $@, $@.", //
4344
CharacteristicsImpl::getRelatedLocationOrCandidate(endpoint, MethodDoc()), "MethodDoc", //
4445
CharacteristicsImpl::getRelatedLocationOrCandidate(endpoint, ClassDoc()), "ClassDoc", //
45-
package.(DollarAtString), "package", //
46-
type.(DollarAtString), "type", //
47-
subtypes.toString().(DollarAtString), "subtypes", //
48-
name.(DollarAtString), "name", //
49-
signature.(DollarAtString), "signature", //
50-
input.toString().(DollarAtString), "input", //
51-
parameterName.(DollarAtString), "parameterName" //
46+
package, "package", //
47+
type, "type", //
48+
subtypes, "subtypes", //
49+
name, "name", //
50+
signature, "signature", //
51+
input, "input", //
52+
parameterName, "parameterName" //

0 commit comments

Comments
 (0)