Skip to content

Commit 7b8238e

Browse files
bric3amarziali
andauthored
Add peer tags, span kind and trace root flag to MetricKey bucket (#9178)
* chore(css): Add peer tags, span kind and trace root flag to MetricKey bucket * chore(css): Use a map to create the metric key Also delays the creation of the Utf8ByteString at serialization time. Note that `writer.writeUTF8` emits a header corresponding to the length of the value being written, calling this method again will count as an entry in the array. A possible idea was to use a cache to store the computation of the peerTags, however `Utf8ByteString` is not concatenable / appendable, which is necessary to have the proper encoding. Creating a new "temporary" `Utf8ByteString`, was replaced by a direct call to `String: :getBytes`. * Improves CSS peer tag aggregation (#9336) * Add jmh for metrics aggregation * Cache peer tags to avoid too many strings/utf8 conversions * Use span kind cache * Fix tests * Hardcode eligible span kind since agent backpropagated are deprecated * revisit peer tags aggregation rules according to the rfc * IsTraceRoot is a tristate * Don't confuse trace root with top levels * Fix build after rebase --------- Co-authored-by: Andrea Marziali <[email protected]>
1 parent 69196fa commit 7b8238e

File tree

12 files changed

+621
-97
lines changed

12 files changed

+621
-97
lines changed

communication/src/main/java/datadog/communication/ddagent/DDAgentFeaturesDiscovery.java

Lines changed: 0 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -92,7 +92,6 @@ public class DDAgentFeaturesDiscovery implements DroppingPolicy {
9292
private volatile String version;
9393
private volatile String telemetryProxyEndpoint;
9494
private volatile Set<String> peerTags = emptySet();
95-
private volatile Set<String> spanKindsToComputedStats = emptySet();
9695

9796
private long lastTimeDiscovered;
9897

@@ -127,7 +126,6 @@ private void reset() {
127126
lastTimeDiscovered = 0;
128127
telemetryProxyEndpoint = null;
129128
peerTags = emptySet();
130-
spanKindsToComputedStats = emptySet();
131129
}
132130

133131
/** Run feature discovery, unconditionally. */
@@ -310,12 +308,6 @@ private boolean processInfoResponse(String response) {
310308
peer_tags instanceof List
311309
? unmodifiableSet(new HashSet<>((List<String>) peer_tags))
312310
: emptySet();
313-
314-
Object span_kinds = map.get("span_kinds_stats_computed");
315-
spanKindsToComputedStats =
316-
span_kinds instanceof List
317-
? unmodifiableSet(new HashSet<>((List<String>) span_kinds))
318-
: emptySet();
319311
}
320312
try {
321313
state = Strings.sha256(response);
@@ -377,10 +369,6 @@ public Set<String> peerTags() {
377369
return peerTags;
378370
}
379371

380-
public Set<String> spanKindsToComputedStats() {
381-
return spanKindsToComputedStats;
382-
}
383-
384372
public String getMetricsEndpoint() {
385373
return metricsEndpoint;
386374
}

communication/src/test/groovy/datadog/communication/ddagent/DDAgentFeaturesDiscoveryTest.groovy

Lines changed: 0 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -462,12 +462,6 @@ class DDAgentFeaturesDiscoveryTest extends DDSpecification {
462462
"tablename",
463463
"topicname"
464464
)
465-
features.spanKindsToComputedStats().containsAll(
466-
"client",
467-
"consumer",
468-
"producer",
469-
"server"
470-
)
471465
}
472466

473467
def "should send container id as header on the info request and parse the hash in the response"() {
Lines changed: 99 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,99 @@
1+
package datadog.trace.common.metrics;
2+
3+
import static java.util.concurrent.TimeUnit.MICROSECONDS;
4+
import static java.util.concurrent.TimeUnit.SECONDS;
5+
6+
import datadog.communication.ddagent.DDAgentFeaturesDiscovery;
7+
import datadog.communication.monitor.Monitoring;
8+
import datadog.trace.api.WellKnownTags;
9+
import datadog.trace.core.CoreSpan;
10+
import datadog.trace.core.monitor.HealthMetrics;
11+
import datadog.trace.util.Strings;
12+
import java.nio.ByteBuffer;
13+
import java.util.ArrayList;
14+
import java.util.Collections;
15+
import java.util.List;
16+
import java.util.Set;
17+
import org.openjdk.jmh.annotations.Benchmark;
18+
import org.openjdk.jmh.annotations.BenchmarkMode;
19+
import org.openjdk.jmh.annotations.Fork;
20+
import org.openjdk.jmh.annotations.Measurement;
21+
import org.openjdk.jmh.annotations.Mode;
22+
import org.openjdk.jmh.annotations.OutputTimeUnit;
23+
import org.openjdk.jmh.annotations.Scope;
24+
import org.openjdk.jmh.annotations.State;
25+
import org.openjdk.jmh.annotations.Warmup;
26+
import org.openjdk.jmh.infra.Blackhole;
27+
28+
@State(Scope.Benchmark)
29+
@Warmup(iterations = 1, time = 30, timeUnit = SECONDS)
30+
@Measurement(iterations = 3, time = 30, timeUnit = SECONDS)
31+
@BenchmarkMode(Mode.AverageTime)
32+
@OutputTimeUnit(MICROSECONDS)
33+
@Fork(value = 1)
34+
public class ConflatingMetricsAggregatorBenchmark {
35+
private final DDAgentFeaturesDiscovery featuresDiscovery =
36+
new FixedAgentFeaturesDiscovery(
37+
Collections.singleton("peer.hostname"), Collections.emptySet());
38+
private final ConflatingMetricsAggregator aggregator =
39+
new ConflatingMetricsAggregator(
40+
new WellKnownTags("", "", "", "", "", ""),
41+
Collections.emptySet(),
42+
featuresDiscovery,
43+
HealthMetrics.NO_OP,
44+
new NullSink(),
45+
2048,
46+
2048);
47+
private final List<CoreSpan<?>> spans = generateTrace(64);
48+
49+
static List<CoreSpan<?>> generateTrace(int len) {
50+
final List<CoreSpan<?>> trace = new ArrayList<>();
51+
for (int i = 0; i < len; i++) {
52+
SimpleSpan span = new SimpleSpan("", "", "", "", true, true, false, 0, 10, -1);
53+
span.setTag("peer.hostname", Strings.random(10));
54+
trace.add(span);
55+
}
56+
return trace;
57+
}
58+
59+
static class NullSink implements Sink {
60+
61+
@Override
62+
public void register(EventListener listener) {}
63+
64+
@Override
65+
public void accept(int messageCount, ByteBuffer buffer) {}
66+
}
67+
68+
static class FixedAgentFeaturesDiscovery extends DDAgentFeaturesDiscovery {
69+
private final Set<String> peerTags;
70+
private final Set<String> spanKinds;
71+
72+
public FixedAgentFeaturesDiscovery(Set<String> peerTags, Set<String> spanKinds) {
73+
// create a fixed discovery with metrics enabled
74+
super(null, Monitoring.DISABLED, null, false, true);
75+
this.peerTags = peerTags;
76+
this.spanKinds = spanKinds;
77+
}
78+
79+
@Override
80+
public void discover() {
81+
// do nothing
82+
}
83+
84+
@Override
85+
public boolean supportsMetrics() {
86+
return true;
87+
}
88+
89+
@Override
90+
public Set<String> peerTags() {
91+
return peerTags;
92+
}
93+
}
94+
95+
@Benchmark
96+
public void benchmark(Blackhole blackhole) {
97+
blackhole.consume(aggregator.publish(spans));
98+
}
99+
}

dd-trace-core/src/main/java/datadog/trace/common/metrics/ConflatingMetricsAggregator.java

Lines changed: 72 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,20 +1,28 @@
11
package datadog.trace.common.metrics;
22

33
import static datadog.communication.ddagent.DDAgentFeaturesDiscovery.V6_METRICS_ENDPOINT;
4+
import static datadog.trace.api.DDTags.BASE_SERVICE;
45
import static datadog.trace.api.Functions.UTF8_ENCODE;
56
import static datadog.trace.bootstrap.instrumentation.api.Tags.SPAN_KIND;
7+
import static datadog.trace.bootstrap.instrumentation.api.Tags.SPAN_KIND_CLIENT;
8+
import static datadog.trace.bootstrap.instrumentation.api.Tags.SPAN_KIND_CONSUMER;
9+
import static datadog.trace.bootstrap.instrumentation.api.Tags.SPAN_KIND_INTERNAL;
10+
import static datadog.trace.bootstrap.instrumentation.api.Tags.SPAN_KIND_PRODUCER;
11+
import static datadog.trace.bootstrap.instrumentation.api.Tags.SPAN_KIND_SERVER;
612
import static datadog.trace.common.metrics.AggregateMetric.ERROR_TAG;
713
import static datadog.trace.common.metrics.AggregateMetric.TOP_LEVEL_TAG;
814
import static datadog.trace.common.metrics.SignalItem.ReportSignal.REPORT;
915
import static datadog.trace.common.metrics.SignalItem.StopSignal.STOP;
1016
import static datadog.trace.util.AgentThreadFactory.AgentThread.METRICS_AGGREGATOR;
1117
import static datadog.trace.util.AgentThreadFactory.THREAD_JOIN_TIMOUT_MS;
1218
import static datadog.trace.util.AgentThreadFactory.newAgentThread;
19+
import static java.util.Collections.unmodifiableSet;
1320
import static java.util.concurrent.TimeUnit.SECONDS;
1421

1522
import datadog.communication.ddagent.DDAgentFeaturesDiscovery;
1623
import datadog.communication.ddagent.SharedCommunicationObjects;
1724
import datadog.trace.api.Config;
25+
import datadog.trace.api.Pair;
1826
import datadog.trace.api.WellKnownTags;
1927
import datadog.trace.api.cache.DDCache;
2028
import datadog.trace.api.cache.DDCaches;
@@ -25,14 +33,18 @@
2533
import datadog.trace.core.DDTraceCoreInfo;
2634
import datadog.trace.core.monitor.HealthMetrics;
2735
import datadog.trace.util.AgentTaskScheduler;
36+
import java.util.ArrayList;
37+
import java.util.Arrays;
2838
import java.util.Collections;
39+
import java.util.HashSet;
2940
import java.util.List;
3041
import java.util.Map;
3142
import java.util.Queue;
3243
import java.util.Set;
3344
import java.util.concurrent.CompletableFuture;
3445
import java.util.concurrent.Future;
3546
import java.util.concurrent.TimeUnit;
47+
import java.util.function.Function;
3648
import org.jctools.maps.NonBlockingHashMap;
3749
import org.jctools.queues.MpscCompoundQueue;
3850
import org.jctools.queues.SpmcArrayQueue;
@@ -49,8 +61,32 @@ public final class ConflatingMetricsAggregator implements MetricsAggregator, Eve
4961
private static final DDCache<String, UTF8BytesString> SERVICE_NAMES =
5062
DDCaches.newFixedSizeCache(32);
5163

64+
private static final DDCache<CharSequence, UTF8BytesString> SPAN_KINDS =
65+
DDCaches.newFixedSizeCache(16);
66+
private static final DDCache<
67+
String, Pair<DDCache<String, UTF8BytesString>, Function<String, UTF8BytesString>>>
68+
PEER_TAGS_CACHE =
69+
DDCaches.newFixedSizeCache(
70+
64); // it can be unbounded since those values are returned by the agent and should be
71+
// under control. 64 entries is enough in this case to contain all the peer tags.
72+
private static final Function<
73+
String, Pair<DDCache<String, UTF8BytesString>, Function<String, UTF8BytesString>>>
74+
PEER_TAGS_CACHE_ADDER =
75+
key ->
76+
Pair.of(
77+
DDCaches.newFixedSizeCache(512),
78+
value -> UTF8BytesString.create(key + ":" + value));
5279
private static final CharSequence SYNTHETICS_ORIGIN = "synthetics";
5380

81+
private static final Set<String> ELIGIBLE_SPAN_KINDS_FOR_METRICS =
82+
unmodifiableSet(
83+
new HashSet<>(
84+
Arrays.asList(
85+
SPAN_KIND_SERVER, SPAN_KIND_CLIENT, SPAN_KIND_CONSUMER, SPAN_KIND_PRODUCER)));
86+
87+
private static final Set<String> ELIGIBLE_SPAN_KINDS_FOR_PEER_AGGREGATION =
88+
unmodifiableSet(new HashSet<>(Arrays.asList(SPAN_KIND_CLIENT, SPAN_KIND_PRODUCER)));
89+
5490
private final Set<String> ignoredResources;
5591
private final Queue<Batch> batchPool;
5692
private final NonBlockingHashMap<MetricKey, Batch> pending;
@@ -262,18 +298,23 @@ private boolean shouldComputeMetric(CoreSpan<?> span) {
262298
private boolean spanKindEligible(CoreSpan<?> span) {
263299
final Object spanKind = span.getTag(SPAN_KIND);
264300
// use toString since it could be a CharSequence...
265-
return spanKind != null && features.spanKindsToComputedStats().contains(spanKind.toString());
301+
return spanKind != null && ELIGIBLE_SPAN_KINDS_FOR_METRICS.contains(spanKind.toString());
266302
}
267303

268304
private boolean publish(CoreSpan<?> span, boolean isTopLevel) {
305+
final CharSequence spanKind = span.getTag(SPAN_KIND, "");
269306
MetricKey newKey =
270307
new MetricKey(
271308
span.getResourceName(),
272309
SERVICE_NAMES.computeIfAbsent(span.getServiceName(), UTF8_ENCODE),
273310
span.getOperationName(),
274311
span.getType(),
275312
span.getHttpStatusCode(),
276-
isSynthetic(span));
313+
isSynthetic(span),
314+
span.getParentId() == 0,
315+
SPAN_KINDS.computeIfAbsent(
316+
spanKind, UTF8BytesString::create), // save repeated utf8 conversions
317+
getPeerTags(span, spanKind.toString()));
277318
boolean isNewKey = false;
278319
MetricKey key = keys.putIfAbsent(newKey, newKey);
279320
if (null == key) {
@@ -288,7 +329,7 @@ private boolean publish(CoreSpan<?> span, boolean isTopLevel) {
288329
// returning false means that either the batch can't take any
289330
// more data, or it has already been consumed
290331
if (batch.add(tag, durationNanos)) {
291-
// added to a pending batch prior to consumption
332+
// added to a pending batch prior to consumption,
292333
// so skip publishing to the queue (we also know
293334
// the key isn't rare enough to override the sampler)
294335
return false;
@@ -308,6 +349,34 @@ private boolean publish(CoreSpan<?> span, boolean isTopLevel) {
308349
return isNewKey || span.getError() > 0;
309350
}
310351

352+
private List<UTF8BytesString> getPeerTags(CoreSpan<?> span, String spanKind) {
353+
if (ELIGIBLE_SPAN_KINDS_FOR_PEER_AGGREGATION.contains(spanKind)) {
354+
List<UTF8BytesString> peerTags = new ArrayList<>();
355+
for (String peerTag : features.peerTags()) {
356+
Object value = span.getTag(peerTag);
357+
if (value != null) {
358+
final Pair<DDCache<String, UTF8BytesString>, Function<String, UTF8BytesString>>
359+
cacheAndCreator = PEER_TAGS_CACHE.computeIfAbsent(peerTag, PEER_TAGS_CACHE_ADDER);
360+
peerTags.add(
361+
cacheAndCreator
362+
.getLeft()
363+
.computeIfAbsent(value.toString(), cacheAndCreator.getRight()));
364+
}
365+
}
366+
return peerTags;
367+
} else if (SPAN_KIND_INTERNAL.equals(spanKind)) {
368+
// in this case only the base service should be aggregated if present
369+
final String baseService = span.getTag(BASE_SERVICE);
370+
if (baseService != null) {
371+
final Pair<DDCache<String, UTF8BytesString>, Function<String, UTF8BytesString>>
372+
cacheAndCreator = PEER_TAGS_CACHE.computeIfAbsent(BASE_SERVICE, PEER_TAGS_CACHE_ADDER);
373+
return Collections.singletonList(
374+
cacheAndCreator.getLeft().computeIfAbsent(baseService, cacheAndCreator.getRight()));
375+
}
376+
}
377+
return Collections.emptyList();
378+
}
379+
311380
private static boolean isSynthetic(CoreSpan<?> span) {
312381
return span.getOrigin() != null && SYNTHETICS_ORIGIN.equals(span.getOrigin().toString());
313382
}

dd-trace-core/src/main/java/datadog/trace/common/metrics/MetricKey.java

Lines changed: 41 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,8 @@
33
import static datadog.trace.bootstrap.instrumentation.api.UTF8BytesString.EMPTY;
44

55
import datadog.trace.bootstrap.instrumentation.api.UTF8BytesString;
6+
import java.util.Collections;
7+
import java.util.List;
68

79
/** The aggregation key for tracked metrics. */
810
public final class MetricKey {
@@ -13,25 +15,43 @@ public final class MetricKey {
1315
private final int httpStatusCode;
1416
private final boolean synthetics;
1517
private final int hash;
18+
private final boolean isTraceRoot;
19+
private final UTF8BytesString spanKind;
20+
private final List<UTF8BytesString> peerTags;
1621

1722
public MetricKey(
1823
CharSequence resource,
1924
CharSequence service,
2025
CharSequence operationName,
2126
CharSequence type,
2227
int httpStatusCode,
23-
boolean synthetics) {
28+
boolean synthetics,
29+
boolean isTraceRoot,
30+
CharSequence spanKind,
31+
List<UTF8BytesString> peerTags) {
2432
this.resource = null == resource ? EMPTY : UTF8BytesString.create(resource);
2533
this.service = null == service ? EMPTY : UTF8BytesString.create(service);
2634
this.operationName = null == operationName ? EMPTY : UTF8BytesString.create(operationName);
2735
this.type = null == type ? EMPTY : UTF8BytesString.create(type);
2836
this.httpStatusCode = httpStatusCode;
2937
this.synthetics = synthetics;
30-
// unrolled polynomial hashcode which avoids allocating varargs
31-
// the constants are 31^5, 31^4, 31^3, 31^2, 31^1, 31^0
38+
this.isTraceRoot = isTraceRoot;
39+
this.spanKind = null == spanKind ? EMPTY : UTF8BytesString.create(spanKind);
40+
this.peerTags = peerTags == null ? Collections.emptyList() : peerTags;
41+
42+
// Unrolled polynomial hashcode to avoid varargs allocation
43+
// and eliminate data dependency between iterations as in Arrays.hashCode.
44+
// Coefficient constants are powers of 31, with integer overflow (hence negative numbers).
45+
// See
46+
// https://richardstartin.github.io/posts/collecting-rocks-and-benchmarks
47+
// https://richardstartin.github.io/posts/still-true-in-java-9-handwritten-hash-codes-are-faster
48+
3249
this.hash =
33-
28629151 * this.resource.hashCode()
34-
+ 923521 * this.service.hashCode()
50+
-196513505 * Boolean.hashCode(this.isTraceRoot)
51+
+ -1807454463 * this.spanKind.hashCode()
52+
+ 887_503_681 * this.peerTags.hashCode() // possibly unroll here has well.
53+
+ 28_629_151 * this.resource.hashCode()
54+
+ 923_521 * this.service.hashCode()
3555
+ 29791 * this.operationName.hashCode()
3656
+ 961 * this.type.hashCode()
3757
+ 31 * httpStatusCode
@@ -62,6 +82,18 @@ public boolean isSynthetics() {
6282
return synthetics;
6383
}
6484

85+
public boolean isTraceRoot() {
86+
return isTraceRoot;
87+
}
88+
89+
public UTF8BytesString getSpanKind() {
90+
return spanKind;
91+
}
92+
93+
public List<UTF8BytesString> getPeerTags() {
94+
return peerTags;
95+
}
96+
6597
@Override
6698
public boolean equals(Object o) {
6799
if (this == o) {
@@ -75,7 +107,10 @@ public boolean equals(Object o) {
75107
&& resource.equals(metricKey.resource)
76108
&& service.equals(metricKey.service)
77109
&& operationName.equals(metricKey.operationName)
78-
&& type.equals(metricKey.type);
110+
&& type.equals(metricKey.type)
111+
&& isTraceRoot == metricKey.isTraceRoot
112+
&& spanKind.equals(metricKey.spanKind)
113+
&& peerTags.equals(metricKey.peerTags);
79114
}
80115
return false;
81116
}

0 commit comments

Comments
 (0)