Skip to content

Commit 9193de6

Browse files
authored
Merge pull request github#13730 from github/tausbn/limit-number-of-candidates-in-application-mode
Java: Limit the number of samples extracted in application mode
2 parents 31bed36 + 49194a2 commit 9193de6

File tree

1 file changed

+36
-0
lines changed

1 file changed

+36
-0
lines changed

java/ql/src/Telemetry/AutomodelApplicationModeExtractCandidates.ql

Lines changed: 36 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -12,9 +12,44 @@
1212
* @tags internal extract automodel application-mode candidates
1313
*/
1414

15+
import java
1516
private import AutomodelApplicationModeCharacteristics
1617
private import AutomodelJavaUtil
1718

19+
/**
 * Gets a sample of endpoints (of at most `limit` samples) with the given method signature.
 *
 * The main purpose of this helper predicate is to avoid selecting too many candidates, as this may
 * cause the SARIF file to exceed the maximum size limit.
 *
 * If there are no more than `limit` endpoints with the given signature, every rank of the ordered
 * endpoint list is selected, so no endpoint is dropped unnecessarily.
 */
bindingset[limit]
private Endpoint getSampleForSignature(
  int limit, string package, string type, string subtypes, string name, string signature,
  string input
) {
  exists(int n, int num_endpoints, int stride, ApplicationModeMetadataExtractor meta |
    num_endpoints =
      count(Endpoint e | meta.hasMetadata(e, package, type, subtypes, name, signature, input)) and
    // Integer division truncates, so `num_endpoints / limit` is 0 whenever there are fewer than
    // `limit` endpoints. A stride of 0 would collapse every sample index onto the same rank and
    // select only a single endpoint, so we clamp the stride to at least 1.
    stride = (num_endpoints / limit).maximum(1)
  |
    result =
      rank[n](Endpoint e, Location loc |
        loc = e.getLocation() and
        meta.hasMetadata(e, package, type, subtypes, name, signature, input)
      |
        e
        order by
          loc.getFile().getAbsolutePath(), loc.getStartLine(), loc.getStartColumn(),
          loc.getEndLine(), loc.getEndColumn()
      ) and
    // To avoid selecting samples that are too close together (as the ranking above goes by file
    // path first), we select `limit` evenly spaced samples from the ranked list of endpoints. By
    // default this would always include the first sample, so we add a random-chosen prime offset
    // to the first sample index, and reduce modulo the number of endpoints.
    // Finally, we add 1 to the result, as ranking results in a 1-indexed relation.
    n = 1 + (([0 .. limit - 1] * stride + 46337) % num_endpoints)
  )
}
52+
1853
from
1954
Endpoint endpoint, string message, ApplicationModeMetadataExtractor meta, DollarAtString package,
2055
DollarAtString type, DollarAtString subtypes, DollarAtString name, DollarAtString signature,
@@ -23,6 +58,7 @@ where
2358
not exists(CharacteristicsImpl::UninterestingToModelCharacteristic u |
2459
u.appliesToEndpoint(endpoint)
2560
) and
61+
endpoint = getSampleForSignature(9, package, type, subtypes, name, signature, input) and
2662
// If a node is already a known sink for any of our existing ATM queries and is already modeled as a MaD sink, we
2763
// don't include it as a candidate. Otherwise, we might include it as a candidate for query A, but the model will
2864
// label it as a sink for one of the sink types of query B, for which it's already a known sink. This would result in

0 commit comments

Comments
 (0)