Skip to content

Commit cdf0eb0

Browse files
committed
otel: implement retry metrics
1 parent d124007 commit cdf0eb0

File tree

4 files changed

+110
-11
lines changed

4 files changed

+110
-11
lines changed

opentelemetry/src/main/java/io/grpc/opentelemetry/GrpcOpenTelemetry.java

Lines changed: 29 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,7 @@
1919
import static com.google.common.base.Preconditions.checkNotNull;
2020
import static io.grpc.internal.GrpcUtil.IMPLEMENTATION_VERSION;
2121
import static io.grpc.opentelemetry.internal.OpenTelemetryConstants.LATENCY_BUCKETS;
22+
import static io.grpc.opentelemetry.internal.OpenTelemetryConstants.RETRY_BUCKETS;
2223
import static io.grpc.opentelemetry.internal.OpenTelemetryConstants.SIZE_BUCKETS;
2324

2425
import com.google.common.annotations.VisibleForTesting;
@@ -241,6 +242,30 @@ static OpenTelemetryMetricsResource createMetricInstruments(Meter meter,
241242
.build());
242243
}
243244

245+
if (isMetricEnabled("grpc.client.call.retries", enableMetrics,
246+
disableDefault)) {
247+
builder.clientCallRetriesCounter(
248+
meter.histogramBuilder(
249+
"grpc.client.call.retries")
250+
.setUnit("{retry}")
251+
.setDescription("Number of retry attempts made during the client call")
252+
.ofLongs()
253+
.setExplicitBucketBoundariesAdvice(RETRY_BUCKETS)
254+
.build());
255+
}
256+
257+
if (isMetricEnabled("grpc.client.call.retry_delay", enableMetrics,
258+
disableDefault)) {
259+
builder.clientCallRetryDelayCounter(
260+
meter.histogramBuilder(
261+
"grpc.client.call.retry_delay")
262+
.setUnit("s")
263+
.setDescription("Total time of delay while there is no active attempt during the " +
264+
"client call")
265+
.setExplicitBucketBoundariesAdvice(LATENCY_BUCKETS)
266+
.build());
267+
}
268+
244269
if (isMetricEnabled("grpc.server.call.started", enableMetrics, disableDefault)) {
245270
builder.serverCallCountCounter(
246271
meter.counterBuilder("grpc.server.call.started")
@@ -259,8 +284,8 @@ static OpenTelemetryMetricsResource createMetricInstruments(Meter meter,
259284
.build());
260285
}
261286

262-
if (isMetricEnabled("grpc.server.call.sent_total_compressed_message_size", enableMetrics,
263-
disableDefault)) {
287+
if (isMetricEnabled("grpc.server.call.sent_total_compressed_message_size",
288+
enableMetrics, disableDefault)) {
264289
builder.serverTotalSentCompressedMessageSizeCounter(
265290
meter.histogramBuilder(
266291
"grpc.server.call.sent_total_compressed_message_size")
@@ -271,8 +296,8 @@ static OpenTelemetryMetricsResource createMetricInstruments(Meter meter,
271296
.build());
272297
}
273298

274-
if (isMetricEnabled("grpc.server.call.rcvd_total_compressed_message_size", enableMetrics,
275-
disableDefault)) {
299+
if (isMetricEnabled("grpc.server.call.rcvd_total_compressed_message_size",
300+
enableMetrics, disableDefault)) {
276301
builder.serverTotalReceivedCompressedMessageSizeCounter(
277302
meter.histogramBuilder(
278303
"grpc.server.call.rcvd_total_compressed_message_size")

opentelemetry/src/main/java/io/grpc/opentelemetry/OpenTelemetryMetricsModule.java

Lines changed: 50 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,7 @@
2020
import static io.grpc.opentelemetry.internal.OpenTelemetryConstants.BACKEND_SERVICE_KEY;
2121
import static io.grpc.opentelemetry.internal.OpenTelemetryConstants.LOCALITY_KEY;
2222
import static io.grpc.opentelemetry.internal.OpenTelemetryConstants.METHOD_KEY;
23+
import static io.grpc.opentelemetry.internal.OpenTelemetryConstants.RETRY_TYPE_KEY;
2324
import static io.grpc.opentelemetry.internal.OpenTelemetryConstants.STATUS_KEY;
2425
import static io.grpc.opentelemetry.internal.OpenTelemetryConstants.TARGET_KEY;
2526

@@ -44,6 +45,7 @@
4445
import io.grpc.Status;
4546
import io.grpc.Status.Code;
4647
import io.grpc.StreamTracer;
48+
import io.grpc.opentelemetry.internal.OpenTelemetryConstants;
4749
import io.opentelemetry.api.common.AttributesBuilder;
4850
import java.util.ArrayList;
4951
import java.util.Collection;
@@ -71,6 +73,7 @@
7173
*/
7274
final class OpenTelemetryMetricsModule {
7375
private static final Logger logger = Logger.getLogger(OpenTelemetryMetricsModule.class.getName());
76+
private static final double NANOS_PER_SEC = 1_000_000_000.0;
7477
public static final ImmutableSet<String> DEFAULT_PER_CALL_METRICS_SET =
7578
ImmutableSet.of(
7679
"grpc.client.attempt.started",
@@ -292,9 +295,11 @@ static final class CallAttemptsTracerFactory extends ClientStreamTracer.Factory
292295
private final String fullMethodName;
293296
private final List<OpenTelemetryPlugin.ClientCallPlugin> callPlugins;
294297
private Status status;
298+
private long retryDelayNanos;
295299
private long callLatencyNanos;
296300
private final Object lock = new Object();
297301
private final AtomicLong attemptsPerCall = new AtomicLong();
302+
private final AtomicLong transparentRetriesPerCall = new AtomicLong();
298303
@GuardedBy("lock")
299304
private int activeStreams;
300305
@GuardedBy("lock")
@@ -331,6 +336,7 @@ public ClientStreamTracer newClientStreamTracer(StreamInfo info, Metadata metada
331336
}
332337
if (++activeStreams == 1 && attemptStopwatch.isRunning()) {
333338
attemptStopwatch.stop();
339+
retryDelayNanos = attemptStopwatch.elapsed(TimeUnit.NANOSECONDS);
334340
}
335341
}
336342
// Skip recording for the first time, since it is already recorded in
@@ -344,7 +350,9 @@ public ClientStreamTracer newClientStreamTracer(StreamInfo info, Metadata metada
344350
module.resource.clientAttemptCountCounter().add(1, attribute);
345351
}
346352
}
347-
if (!info.isTransparentRetry()) {
353+
if (info.isTransparentRetry()) {
354+
transparentRetriesPerCall.incrementAndGet();
355+
} else {
348356
attemptsPerCall.incrementAndGet();
349357
}
350358
return newClientTracer(info);
@@ -407,14 +415,49 @@ void recordFinishedCall() {
407415
tracer.recordFinishedAttempt();
408416
}
409417
callLatencyNanos = callStopWatch.elapsed(TimeUnit.NANOSECONDS);
410-
io.opentelemetry.api.common.Attributes attribute =
411-
io.opentelemetry.api.common.Attributes.of(METHOD_KEY, fullMethodName,
412-
TARGET_KEY, target,
413-
STATUS_KEY, status.getCode().toString());
414418

415419
if (module.resource.clientCallDurationCounter() != null) {
416-
module.resource.clientCallDurationCounter()
417-
.record(callLatencyNanos * SECONDS_PER_NANO, attribute);
420+
long retriesPerCall = 0;
421+
long attempts = attemptsPerCall.get();
422+
if (attempts > 0) {
423+
retriesPerCall = attempts - 1;
424+
}
425+
426+
// Base attributes
427+
io.opentelemetry.api.common.Attributes baseAttributes =
428+
io.opentelemetry.api.common.Attributes.of(
429+
METHOD_KEY, fullMethodName,
430+
TARGET_KEY, target
431+
);
432+
433+
// Duration
434+
module.resource.clientCallDurationCounter().record(
435+
callLatencyNanos * SECONDS_PER_NANO,
436+
baseAttributes.toBuilder()
437+
.put(STATUS_KEY, status.getCode().toString())
438+
.build()
439+
);
440+
441+
// Retry counts
442+
module.resource.clientCallRetriesCounter().record(
443+
retriesPerCall,
444+
baseAttributes.toBuilder()
445+
.put(RETRY_TYPE_KEY, OpenTelemetryConstants.RetryType.RETRY.getValue())
446+
.build()
447+
);
448+
449+
module.resource.clientCallRetriesCounter().record(
450+
transparentRetriesPerCall.get(),
451+
baseAttributes.toBuilder()
452+
.put(RETRY_TYPE_KEY, OpenTelemetryConstants.RetryType.TRANSPARENT.getValue())
453+
.build()
454+
);
455+
456+
// Retry delay
457+
module.resource.clientCallRetryDelayCounter().record(
458+
retryDelayNanos / NANOS_PER_SEC,
459+
baseAttributes
460+
);
418461
}
419462
}
420463
}

opentelemetry/src/main/java/io/grpc/opentelemetry/OpenTelemetryMetricsResource.java

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -41,6 +41,11 @@ abstract class OpenTelemetryMetricsResource {
4141
@Nullable
4242
abstract LongHistogram clientTotalReceivedCompressedMessageSizeCounter();
4343

44+
/**
 * Histogram for the {@code grpc.client.call.retries} metric. Nullable: the instrument is only
 * passed to the builder when that metric is enabled, so callers must null-check before use.
 */
@Nullable
45+
abstract LongHistogram clientCallRetriesCounter();
46+
47+
/**
 * Histogram for the {@code grpc.client.call.retry_delay} metric (unit: seconds). Nullable:
 * the instrument is only passed to the builder when that metric is enabled, so callers must
 * null-check before use.
 */
@Nullable
48+
abstract DoubleHistogram clientCallRetryDelayCounter();
4449

4550
/* Server Metrics */
4651
@Nullable
@@ -73,6 +78,10 @@ abstract static class Builder {
7378
abstract Builder clientTotalReceivedCompressedMessageSizeCounter(
7479
LongHistogram counter);
7580

81+
/** Sets the histogram backing the {@code grpc.client.call.retries} metric. */
abstract Builder clientCallRetriesCounter(LongHistogram counter);
82+
83+
/** Sets the histogram backing the {@code grpc.client.call.retry_delay} metric. */
abstract Builder clientCallRetryDelayCounter(DoubleHistogram counter);
84+
7685
abstract Builder serverCallCountCounter(LongCounter counter);
7786

7887
abstract Builder serverCallDurationCounter(DoubleHistogram counter);

opentelemetry/src/main/java/io/grpc/opentelemetry/internal/OpenTelemetryConstants.java

Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -36,6 +36,9 @@ public final class OpenTelemetryConstants {
3636
public static final AttributeKey<String> BACKEND_SERVICE_KEY =
3737
AttributeKey.stringKey("grpc.lb.backend_service");
3838

39+
/** Attribute key ({@code grpc.retry_type}) whose value is one of {@code RetryType#getValue()}. */
public static final AttributeKey<String> RETRY_TYPE_KEY =
40+
AttributeKey.stringKey("grpc.retry_type");
41+
3942
public static final List<Double> LATENCY_BUCKETS =
4043
ImmutableList.of(
4144
0d, 0.00001d, 0.00005d, 0.0001d, 0.0003d, 0.0006d, 0.0008d, 0.001d, 0.002d,
@@ -49,6 +52,25 @@ public final class OpenTelemetryConstants {
4952
0L, 1024L, 2048L, 4096L, 16384L, 65536L, 262144L, 1048576L, 4194304L, 16777216L,
5053
67108864L, 268435456L, 1073741824L, 4294967296L);
5154

55+
/**
 * Bucket boundaries supplied as explicit-bucket advice for the {@code grpc.client.call.retries}
 * histogram.
 */
public static final List<Long> RETRY_BUCKETS =
56+
ImmutableList.of(0L, 1L, 2L, 3L, 4L, 5L, 10L, 100L, 1000L);
57+
58+
/**
 * Kinds of retry reported on retry metrics. Each constant carries the string that is written
 * as the value of the {@code grpc.retry_type} attribute.
 */
public enum RetryType {
59+
RETRY("retry"),
60+
// NOTE(review): HEDGE is not referenced by any hunk visible in this commit view.
HEDGE("hedge"),
61+
TRANSPARENT("transparent");
62+
63+
// Attribute value emitted for this retry type.
private final String value;
64+
65+
RetryType(String value) {
66+
this.value = value;
67+
}
68+
69+
/** Returns the attribute value string for this retry type, e.g. {@code "retry"}. */
public String getValue() {
70+
return value;
71+
}
72+
}
73+
5274
private OpenTelemetryConstants() {
5375
}
5476
}

0 commit comments

Comments
 (0)