Skip to content

Commit f492bb9

Browse files
[Profiling] Add support for variable sampling frequency (#128086)
* [Profiling] Add support for variable sampling frequency * Update x-pack/plugin/profiling/src/main/java/org/elasticsearch/xpack/profiling/action/TransportGetStackTracesAction.java Co-authored-by: Christos Kalkanis <[email protected]> * Add comments and remove superfluous debug log --------- Co-authored-by: Christos Kalkanis <[email protected]>
1 parent 7fe9931 commit f492bb9

File tree

12 files changed

+197
-82
lines changed

12 files changed

+197
-82
lines changed

x-pack/plugin/core/template-resources/src/main/resources/profiling/component-template/profiling-events.json

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -76,6 +76,10 @@
7676
"type": "short",
7777
"index": false
7878
},
79+
"Stacktrace.sampling_frequency": {
80+
"type": "long",
81+
"index": false
82+
},
7983
"agent.version": {
8084
"type": "keyword"
8185
},

x-pack/plugin/profiling/src/internalClusterTest/java/org/elasticsearch/xpack/profiling/action/GetStackTracesActionIT.java

Lines changed: 30 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -40,7 +40,13 @@ public void testGetStackTracesUnfiltered() throws Exception {
4040

4141
Map<TraceEventID, TraceEvent> traceEvents = response.getStackTraceEvents();
4242

43-
TraceEventID traceEventID = new TraceEventID("", "497295213074376", "8457605156473051743", "L7kj7UvlKbT-vN73el4faQ");
43+
TraceEventID traceEventID = new TraceEventID(
44+
"",
45+
"497295213074376",
46+
"8457605156473051743",
47+
"L7kj7UvlKbT-vN73el4faQ",
48+
TransportGetStackTracesAction.DEFAULT_SAMPLING_FREQUENCY
49+
);
4450
assertEquals(3L, response.getStackTraceEvents().get(traceEventID).count);
4551

4652
assertNotNull(response.getStackTraces());
@@ -84,7 +90,13 @@ public void testGetStackTracesGroupedByServiceName() throws Exception {
8490

8591
assertNotNull(response.getStackTraceEvents());
8692

87-
TraceEventID traceEventID = new TraceEventID("", "497295213074376", "8457605156473051743", "L7kj7UvlKbT-vN73el4faQ");
93+
TraceEventID traceEventID = new TraceEventID(
94+
"",
95+
"497295213074376",
96+
"8457605156473051743",
97+
"L7kj7UvlKbT-vN73el4faQ",
98+
TransportGetStackTracesAction.DEFAULT_SAMPLING_FREQUENCY
99+
);
88100
assertEquals(3L, response.getStackTraceEvents().get(traceEventID).count);
89101
assertEquals(Long.valueOf(2L), response.getStackTraceEvents().get(traceEventID).subGroups.getCount("basket"));
90102

@@ -131,11 +143,17 @@ public void testGetStackTracesFromAPMWithMatchNoDownsampling() throws Exception
131143

132144
assertNotNull(response.getStackTraceEvents());
133145

134-
TraceEventID traceEventID = new TraceEventID("", "", "", "Ce77w10WeIDow3kd1jowlA");
146+
TraceEventID traceEventID = new TraceEventID(
147+
"",
148+
"",
149+
"",
150+
"Ce77w10WeIDow3kd1jowlA",
151+
TransportGetStackTracesAction.DEFAULT_SAMPLING_FREQUENCY
152+
);
135153
assertEquals(3L, response.getStackTraceEvents().get(traceEventID).count);
136154
assertEquals(Long.valueOf(3L), response.getStackTraceEvents().get(traceEventID).subGroups.getCount("encodeSha1"));
137155

138-
traceEventID = new TraceEventID("", "", "", "JvISdnJ47BQ01489cwF9DA");
156+
traceEventID = new TraceEventID("", "", "", "JvISdnJ47BQ01489cwF9DA", TransportGetStackTracesAction.DEFAULT_SAMPLING_FREQUENCY);
139157
assertEquals(2L, response.getStackTraceEvents().get(traceEventID).count);
140158

141159
assertNotNull(response.getStackTraces());
@@ -182,10 +200,16 @@ public void testGetStackTracesFromAPMWithMatchAndDownsampling() throws Exception
182200
assertNotNull(response.getStackTraceEvents());
183201

184202
// as the sampling rate is 0.2, we see 5 times more samples (random sampler agg automatically adjusts sample count)
185-
TraceEventID traceEventID = new TraceEventID("", "", "", "Ce77w10WeIDow3kd1jowlA");
203+
TraceEventID traceEventID = new TraceEventID(
204+
"",
205+
"",
206+
"",
207+
"Ce77w10WeIDow3kd1jowlA",
208+
TransportGetStackTracesAction.DEFAULT_SAMPLING_FREQUENCY
209+
);
186210
assertEquals(5 * 3L, response.getStackTraceEvents().get(traceEventID).count);
187211

188-
traceEventID = new TraceEventID("", "", "", "JvISdnJ47BQ01489cwF9DA");
212+
traceEventID = new TraceEventID("", "", "", "JvISdnJ47BQ01489cwF9DA", TransportGetStackTracesAction.DEFAULT_SAMPLING_FREQUENCY);
189213
assertEquals(5 * 2L, response.getStackTraceEvents().get(traceEventID).count);
190214

191215
assertNotNull(response.getStackTraces());

x-pack/plugin/profiling/src/main/java/org/elasticsearch/xpack/profiling/action/CO2Calculator.java

Lines changed: 2 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,6 @@
1212
import java.util.Map;
1313

1414
final class CO2Calculator {
15-
private static final double DEFAULT_SAMPLING_FREQUENCY = 19.0d;
1615
private static final double DEFAULT_CO2_TONS_PER_KWH = 0.000379069d; // unit: metric tons / kWh
1716
private static final double DEFAULT_KILOWATTS_PER_CORE_X86 = 7.0d / 1000.0d; // unit: kilowatt / core
1817
private static final double DEFAULT_KILOWATTS_PER_CORE_ARM64 = 2.8d / 1000.0d; // unit: kilowatt / core
@@ -43,8 +42,8 @@ final class CO2Calculator {
4342
: customPerCoreWattARM64 / 1000.0d;
4443
}
4544

46-
public double getAnnualCO2Tons(String hostID, long samples) {
47-
double annualCoreHours = CostCalculator.annualCoreHours(samplingDurationInSeconds, samples, DEFAULT_SAMPLING_FREQUENCY);
45+
public double getAnnualCO2Tons(String hostID, long samples, double samplingFrequency) {
46+
double annualCoreHours = CostCalculator.annualCoreHours(samplingDurationInSeconds, samples, samplingFrequency);
4847

4948
HostMetadata host = hostMetadata.get(hostID);
5049
if (host == null) {

x-pack/plugin/profiling/src/main/java/org/elasticsearch/xpack/profiling/action/CostCalculator.java

Lines changed: 2 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,6 @@
1010
import java.util.Map;
1111

1212
final class CostCalculator {
13-
private static final double DEFAULT_SAMPLING_FREQUENCY = 19.0d;
1413
private static final double SECONDS_PER_HOUR = 60 * 60;
1514
private static final double SECONDS_PER_YEAR = SECONDS_PER_HOUR * 24 * 365.0d; // unit: seconds
1615
public static final double DEFAULT_COST_USD_PER_CORE_HOUR = 0.0425d; // unit: USD / (core * hour)
@@ -40,8 +39,8 @@ final class CostCalculator {
4039
);
4140
}
4241

43-
public double annualCostsUSD(String hostID, double samples) {
44-
double annualCoreHours = annualCoreHours(samplingDurationInSeconds, samples, DEFAULT_SAMPLING_FREQUENCY);
42+
public double annualCostsUSD(String hostID, double samples, double samplingFrequency) {
43+
double annualCoreHours = annualCoreHours(samplingDurationInSeconds, samples, samplingFrequency);
4544

4645
HostMetadata host = hostMetadata.get(hostID);
4746
if (host == null) {
@@ -59,7 +58,6 @@ public double annualCostsUSD(String hostID, double samples) {
5958
}
6059

6160
public static double annualCoreHours(double duration, double samples, double samplingFrequency) {
62-
// samplingFrequency will a variable value when we start supporting probabilistic profiling (soon).
6361
return (SECONDS_PER_YEAR / duration * samples / samplingFrequency) / SECONDS_PER_HOUR; // unit: core * hour
6462
}
6563
}

x-pack/plugin/profiling/src/main/java/org/elasticsearch/xpack/profiling/action/Resampler.java

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -14,7 +14,7 @@ class Resampler {
1414
private final boolean requiresResampling;
1515
private final RandomGenerator r;
1616
private final double adjustedSampleRate;
17-
private final double p;
17+
public final double p;
1818

1919
Resampler(GetStackTracesRequest request, double sampleRate, long totalCount) {
2020
// Manually reduce sample count if totalCount exceeds sampleSize by 10%.
@@ -50,7 +50,7 @@ public int adjustSampleCount(int originalCount) {
5050
}
5151
// Adjust the sample counts from down-sampled to fully sampled.
5252
// Be aware that downsampling drops entries from stackTraceEvents, so that
53-
// the sum of the upscaled count values is less that totalCount.
54-
return (int) Math.floor(rawCount / (p * adjustedSampleRate));
53+
// the sum of the upscaled count values is less than totalCount.
54+
return (int) Math.round(rawCount / (p * adjustedSampleRate));
5555
}
5656
}

x-pack/plugin/profiling/src/main/java/org/elasticsearch/xpack/profiling/action/TraceEventID.java

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -7,4 +7,4 @@
77

88
package org.elasticsearch.xpack.profiling.action;
99

10-
record TraceEventID(String executableName, String threadName, String hostID, String stacktraceID) {}
10+
/**
 * Map key that identifies a group of trace events by executable name, thread name, host ID and stack trace ID.
 * The sampling frequency is part of the key; it is the frequency handed to the CO2 and cost calculators
 * together with the event's sample count.
 */
record TraceEventID(String executableName, String threadName, String hostID, String stacktraceID, double samplingFrequency) {}

x-pack/plugin/profiling/src/main/java/org/elasticsearch/xpack/profiling/action/TransportGetStackTracesAction.java

Lines changed: 87 additions & 36 deletions
Original file line numberDiff line numberDiff line change
@@ -115,6 +115,12 @@ public class TransportGetStackTracesAction extends TransportAction<GetStackTrace
115115
*/
116116
private static final String CUSTOM_EVENT_SUB_AGGREGATION_NAME = "custom_event_group";
117117

118+
/**
119+
* This is the default sampling frequency for profiling events that we use if no sampling frequency is
120+
* stored in the backend (backwards compatibility).
121+
*/
122+
public static final double DEFAULT_SAMPLING_FREQUENCY = 19.0d;
123+
118124
private final NodeClient nodeClient;
119125
private final ProfilingLicenseChecker licenseChecker;
120126
private final ClusterService clusterService;
@@ -249,7 +255,6 @@ private void searchGenericEventGroupedByStackTrace(
249255
ActionListener<GetStackTracesResponse> submitListener,
250256
GetStackTracesResponseBuilder responseBuilder
251257
) {
252-
253258
CountedTermsAggregationBuilder groupByStackTraceId = new CountedTermsAggregationBuilder("group_by").size(
254259
MAX_TRACE_EVENTS_RESULT_SIZE
255260
).field(request.getStackTraceIdsField());
@@ -286,7 +291,7 @@ private void searchGenericEventGroupedByStackTrace(
286291

287292
String stackTraceID = stacktraceBucket.getKeyAsString();
288293

289-
TraceEventID eventID = new TraceEventID("", "", "", stackTraceID);
294+
TraceEventID eventID = new TraceEventID("", "", "", stackTraceID, DEFAULT_SAMPLING_FREQUENCY);
290295
TraceEvent event = stackTraceEvents.computeIfAbsent(eventID, k -> new TraceEvent());
291296
event.count += count;
292297
subGroups.collectResults(stacktraceBucket, event);
@@ -337,6 +342,16 @@ private void searchEventGroupedByStackTrace(
337342
// Especially with high cardinality fields, this makes aggregations really slow.
338343
.executionHint("map")
339344
.subAggregation(groupByHostId);
345+
TermsAggregationBuilder groupByExecutableName = new TermsAggregationBuilder("group_by")
346+
// 'size' specifies the max number of executable names we support per request.
347+
.size(MAX_TRACE_EVENTS_RESULT_SIZE)
348+
.field("process.executable.name")
349+
// missing("") is used to include documents where the field is missing.
350+
.missing("")
351+
// 'execution_hint: map' skips the slow building of ordinals that we don't need.
352+
// Especially with high cardinality fields, this makes aggregations really slow.
353+
.executionHint("map")
354+
.subAggregation(groupByThreadName);
340355
SubGroupCollector subGroups = SubGroupCollector.attach(groupByStackTraceId, request.getAggregationFields());
341356
client.prepareSearch(eventsIndex.getName())
342357
.setTrackTotalHits(false)
@@ -351,53 +366,89 @@ private void searchEventGroupedByStackTrace(
351366
new TermsAggregationBuilder("group_by")
352367
// 'size' specifies the max number of sampling frequencies we support per request.
353368
.size(MAX_TRACE_EVENTS_RESULT_SIZE)
354-
.field("process.executable.name")
355-
// missing("") is used to include documents where the field is missing.
356-
.missing("")
369+
.field("Stacktrace.sampling_frequency")
370+
// missing(DEFAULT_SAMPLING_FREQUENCY) is used to include documents where the field is missing.
371+
.missing((long) DEFAULT_SAMPLING_FREQUENCY)
357372
// 'execution_hint: map' skips the slow building of ordinals that we don't need.
358373
// Especially with high cardinality fields, this makes aggregations really slow.
359374
.executionHint("map")
360-
.subAggregation(groupByThreadName)
375+
.subAggregation(groupByExecutableName)
376+
.subAggregation(new SumAggregationBuilder("total_count").field("Stacktrace.count"))
361377
)
362378
.addAggregation(new SumAggregationBuilder("total_count").field("Stacktrace.count"))
363379
.execute(handleEventsGroupedByStackTrace(submitTask, client, responseBuilder, submitListener, searchResponse -> {
364-
long totalCount = getAggValueAsLong(searchResponse, "total_count");
380+
// The count values for events are scaled up to the highest sampling frequency.
381+
// For example, if the highest sampling frequency is 100, an event with frequency=20 and count=1
382+
// will be upscaled to count=5 (100/20 * count).
383+
// For this, we need to find the highest frequency in the result set.
384+
long maxSamplingFrequency = 0;
385+
Terms samplingFrequencies = searchResponse.getAggregations().get("group_by");
386+
for (Terms.Bucket samplingFrequencyBucket : samplingFrequencies.getBuckets()) {
387+
final double samplingFrequency = samplingFrequencyBucket.getKeyAsNumber().doubleValue();
388+
if (samplingFrequency > maxSamplingFrequency) {
389+
maxSamplingFrequency = (long) samplingFrequency;
390+
}
391+
}
392+
393+
// Calculate a scaled-up total count (scaled up to the highest sampling frequency).
394+
long totalCount = 0;
395+
for (Terms.Bucket samplingFrequencyBucket : samplingFrequencies.getBuckets()) {
396+
InternalNumericMetricsAggregation.SingleValue count = samplingFrequencyBucket.getAggregations().get("total_count");
397+
final double samplingFrequency = samplingFrequencyBucket.getKeyAsNumber().doubleValue();
398+
final double samplingFactor = maxSamplingFrequency / samplingFrequency;
399+
totalCount += Math.round(count.value() * samplingFactor);
400+
}
365401

366402
Resampler resampler = new Resampler(request, responseBuilder.getSamplingRate(), totalCount);
367403

368404
// Sort items lexicographically to access Lucene's term dictionary more efficiently when issuing an mget request.
369-
// The term dictionary is lexicographically sorted and using the same order reduces the number of page faults
405+
// The term dictionary is lexicographically sorted, and using the same order reduces the number of page faults
370406
// needed to load it.
371407
long totalFinalCount = 0;
372408
Map<TraceEventID, TraceEvent> stackTraceEvents = new HashMap<>(MAX_TRACE_EVENTS_RESULT_SIZE);
373409

374-
Terms executableNames = searchResponse.getAggregations().get("group_by");
375-
for (Terms.Bucket executableBucket : executableNames.getBuckets()) {
376-
String executableName = executableBucket.getKeyAsString();
377-
378-
Terms threads = executableBucket.getAggregations().get("group_by");
379-
for (Terms.Bucket threadBucket : threads.getBuckets()) {
380-
String threadName = threadBucket.getKeyAsString();
381-
382-
Terms hosts = threadBucket.getAggregations().get("group_by");
383-
for (Terms.Bucket hostBucket : hosts.getBuckets()) {
384-
String hostID = hostBucket.getKeyAsString();
385-
386-
Terms stacktraces = hostBucket.getAggregations().get("group_by");
387-
for (Terms.Bucket stacktraceBucket : stacktraces.getBuckets()) {
388-
Sum count = stacktraceBucket.getAggregations().get("count");
389-
int finalCount = resampler.adjustSampleCount((int) count.value());
390-
if (finalCount <= 0) {
391-
continue;
410+
// Walk over all nested aggregations.
411+
// The outermost aggregation is the sampling frequency.
412+
// The next level is the executable name, followed by the thread name, host ID and stacktrace ID.
413+
// The innermost aggregation contains the count of samples for each stacktrace ID.
414+
for (Terms.Bucket samplingFrequencyBucket : samplingFrequencies.getBuckets()) {
415+
final double samplingFrequency = samplingFrequencyBucket.getKeyAsNumber().doubleValue();
416+
final double samplingFactor = maxSamplingFrequency / samplingFrequency;
417+
418+
Terms executableNames = samplingFrequencyBucket.getAggregations().get("group_by");
419+
for (Terms.Bucket executableBucket : executableNames.getBuckets()) {
420+
String executableName = executableBucket.getKeyAsString();
421+
422+
Terms threads = executableBucket.getAggregations().get("group_by");
423+
for (Terms.Bucket threadBucket : threads.getBuckets()) {
424+
String threadName = threadBucket.getKeyAsString();
425+
426+
Terms hosts = threadBucket.getAggregations().get("group_by");
427+
for (Terms.Bucket hostBucket : hosts.getBuckets()) {
428+
String hostID = hostBucket.getKeyAsString();
429+
430+
Terms stacktraces = hostBucket.getAggregations().get("group_by");
431+
for (Terms.Bucket stacktraceBucket : stacktraces.getBuckets()) {
432+
Sum count = stacktraceBucket.getAggregations().get("count");
433+
int finalCount = resampler.adjustSampleCount((int) Math.round(count.value() * samplingFactor));
434+
if (finalCount <= 0) {
435+
continue;
436+
}
437+
438+
totalFinalCount += finalCount;
439+
440+
String stackTraceID = stacktraceBucket.getKeyAsString();
441+
TraceEventID eventID = new TraceEventID(
442+
executableName,
443+
threadName,
444+
hostID,
445+
stackTraceID,
446+
maxSamplingFrequency
447+
);
448+
TraceEvent event = stackTraceEvents.computeIfAbsent(eventID, k -> new TraceEvent());
449+
event.count += finalCount;
450+
subGroups.collectResults(stacktraceBucket, event);
392451
}
393-
totalFinalCount += finalCount;
394-
395-
String stackTraceID = stacktraceBucket.getKeyAsString();
396-
397-
TraceEventID eventID = new TraceEventID(executableName, threadName, hostID, stackTraceID);
398-
TraceEvent event = stackTraceEvents.computeIfAbsent(eventID, k -> new TraceEvent());
399-
event.count += finalCount;
400-
subGroups.collectResults(stacktraceBucket, event);
401452
}
402453
}
403454
}
@@ -629,8 +680,8 @@ public void calculateCO2AndCosts() {
629680
);
630681

631682
responseBuilder.getStackTraceEvents().forEach((eventId, event) -> {
632-
event.annualCO2Tons += co2Calculator.getAnnualCO2Tons(eventId.hostID(), event.count);
633-
event.annualCostsUSD += costCalculator.annualCostsUSD(eventId.hostID(), event.count);
683+
event.annualCO2Tons += co2Calculator.getAnnualCO2Tons(eventId.hostID(), event.count, eventId.samplingFrequency());
684+
event.annualCostsUSD += costCalculator.annualCostsUSD(eventId.hostID(), event.count, eventId.samplingFrequency());
634685
});
635686

636687
log.debug(watch::report);

0 commit comments

Comments
 (0)