Skip to content

Commit 709f2c4

Browse files
Applying diagnostic thresholds for request-level metrics (Azure#35114)
* Add threshold-based filtering of transport-level metrics
* Adding Test coverage
* Update CHANGELOG.md
* Reacted to code review comments
* Address code review feedback
1 parent 6d1c57e commit 709f2c4

File tree

7 files changed

+173
-20
lines changed

7 files changed

+173
-20
lines changed

sdk/cosmos/azure-cosmos-tests/src/test/java/com/azure/cosmos/ClientMetricsTest.java

Lines changed: 60 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -50,6 +50,7 @@
5050
import io.micrometer.core.instrument.Tag;
5151
import io.micrometer.core.instrument.Timer;
5252
import io.netty.handler.codec.http.HttpResponseStatus;
53+
import org.testng.SkipException;
5354
import org.testng.annotations.Factory;
5455
import org.testng.annotations.Test;
5556

@@ -94,6 +95,10 @@ private EnumSet<MetricCategory> getEffectiveMetricCategories() {
9495
}
9596

9697
public void beforeTest(CosmosMetricCategory... metricCategories) {
98+
beforeTest(null, metricCategories);
99+
}
100+
101+
public void beforeTest(CosmosDiagnosticsThresholds thresholds, CosmosMetricCategory... metricCategories) {
97102
assertThat(this.client).isNull();
98103
assertThat(this.meterRegistry).isNull();
99104

@@ -102,9 +107,16 @@ public void beforeTest(CosmosMetricCategory... metricCategories) {
102107
this.inputMetricsOptions = new CosmosMicrometerMetricsOptions()
103108
.meterRegistry(this.meterRegistry)
104109
.setMetricCategories(metricCategories);
110+
105111
this.inputClientTelemetryConfig = new CosmosClientTelemetryConfig()
106112
.metricsOptions(this.inputMetricsOptions);
107113

114+
115+
if (thresholds != null) {
116+
this.inputClientTelemetryConfig.diagnosticsThresholds(thresholds);
117+
this.inputMetricsOptions.applyDiagnosticThresholdsForTransportLevelMeters(true);
118+
}
119+
108120
this.client = getClientBuilder()
109121
.clientTelemetryConfig(inputClientTelemetryConfig)
110122
.buildClient();
@@ -353,6 +365,54 @@ public void readItem() throws Exception {
353365
}
354366
}
355367

368+
private void runReadItemTestWithThresholds(
369+
CosmosDiagnosticsThresholds thresholds,
370+
boolean expectRequestMetrics
371+
) {
372+
this.beforeTest(thresholds, CosmosMetricCategory.DEFAULT);
373+
try {
374+
375+
if (this.client.asyncClient().getConnectionPolicy().getConnectionMode() != ConnectionMode.DIRECT) {
376+
throw new SkipException("Test case only relevant for direct model.");
377+
}
378+
379+
InternalObjectNode properties = getDocumentDefinition(UUID.randomUUID().toString());
380+
container.createItem(properties);
381+
382+
CosmosItemResponse<InternalObjectNode> readResponse1 = container.readItem(properties.getId(),
383+
new PartitionKey(ModelBridgeInternal.getObjectFromJsonSerializable(properties, "mypk")),
384+
new CosmosItemRequestOptions(),
385+
InternalObjectNode.class);
386+
validateItemResponse(properties, readResponse1);
387+
388+
CosmosDiagnosticsThresholds maxThresholds = new CosmosDiagnosticsThresholds()
389+
.setPointOperationLatencyThreshold(Duration.ofDays(1));
390+
391+
Tag operationTag = Tag.of(TagName.OperationStatusCode.toString(), "200");
392+
Tag requestTag = Tag.of(TagName.RequestStatusCode.toString(), "200/0");
393+
this.assertMetrics("cosmos.client.op.latency", true, operationTag);
394+
this.assertMetrics("cosmos.client.op.calls", true, operationTag);
395+
this.assertMetrics("cosmos.client.req.rntbd.latency", expectRequestMetrics, requestTag);
396+
this.assertMetrics("cosmos.client.req.rntbd.backendLatency", expectRequestMetrics, requestTag);
397+
this.assertMetrics("cosmos.client.req.rntbd.requests", expectRequestMetrics, requestTag);
398+
Meter reportedRntbdRequestCharge =
399+
this.assertMetrics("cosmos.client.req.rntbd.RUs", expectRequestMetrics, requestTag);
400+
} finally {
401+
this.afterTest();
402+
}
403+
}
404+
405+
@Test(groups = { "simple" }, timeOut = TIMEOUT)
406+
public void readItemWithThresholdsApplied() throws Exception {
407+
CosmosDiagnosticsThresholds maxThresholds = new CosmosDiagnosticsThresholds()
408+
.setPointOperationLatencyThreshold(Duration.ofDays(1));
409+
CosmosDiagnosticsThresholds minThresholds = new CosmosDiagnosticsThresholds()
410+
.setPointOperationLatencyThreshold(Duration.ZERO);
411+
412+
runReadItemTestWithThresholds(maxThresholds, false);
413+
runReadItemTestWithThresholds(minThresholds, true);
414+
}
415+
356416
@Test(groups = { "simple" }, timeOut = TIMEOUT)
357417
public void replaceItem() throws Exception {
358418
this.beforeTest(CosmosMetricCategory.DEFAULT);

sdk/cosmos/azure-cosmos/CHANGELOG.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@
33
### 4.46.0-beta.1 (Unreleased)
44

55
#### Features Added
6+
* Added the capability to filter request-level metrics based on diagnostic thresholds. Request-level metrics are usually used to capture metrics per backend endpoint/replica - a high-cardinality dimension. Filtering by diagnostic thresholds reduces the overhead, but it also means that request-level metrics can only be used for debugging purposes, not for monitoring purposes. So, when this filtering is enabled, it is important to use the unfiltered operation-level metrics for health monitoring. - See [PR 35114](https://github.com/Azure/azure-sdk-for-java/pull/35114)
67

78
#### Breaking Changes
89

sdk/cosmos/azure-cosmos/src/main/java/com/azure/cosmos/implementation/clienttelemetry/ClientTelemetryMetrics.java

Lines changed: 43 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -440,15 +440,19 @@ public void recordOperation(
440440
for (ClientSideRequestStatistics requestStatistics : clientSideRequestStatistics) {
441441

442442
recordStoreResponseStatistics(
443+
diagnosticsContext,
443444
cosmosAsyncClient,
444445
requestStatistics.getResponseStatisticsList());
445446
recordStoreResponseStatistics(
447+
diagnosticsContext,
446448
cosmosAsyncClient,
447449
requestStatistics.getSupplementalResponseStatisticsList());
448450
recordGatewayStatistics(
451+
diagnosticsContext,
449452
cosmosAsyncClient,
450453
requestStatistics.getDuration(), requestStatistics.getGatewayStatistics());
451454
recordAddressResolutionStatistics(
455+
diagnosticsContext,
452456
cosmosAsyncClient,
453457
requestStatistics.getAddressResolutionStatistics());
454458
}
@@ -464,11 +468,12 @@ public void recordOperation(
464468
QueryInfo.QueryPlanDiagnosticsContext queryPlanDiagnostics =
465469
feedDiagnostics.getQueryPlanDiagnosticsContext();
466470

467-
recordQueryPlanDiagnostics(cosmosAsyncClient, queryPlanDiagnostics);
471+
recordQueryPlanDiagnostics(diagnosticsContext, cosmosAsyncClient, queryPlanDiagnostics);
468472
}
469473
}
470474

471475
private void recordQueryPlanDiagnostics(
476+
CosmosDiagnosticsContext ctx,
472477
CosmosAsyncClient cosmosAsyncClient,
473478
QueryInfo.QueryPlanDiagnosticsContext queryPlanDiagnostics
474479
) {
@@ -483,7 +488,8 @@ private void recordQueryPlanDiagnostics(
483488
CosmosMeterOptions requestsOptions = clientAccessor.getMeterOptions(
484489
cosmosAsyncClient,
485490
CosmosMetricName.REQUEST_SUMMARY_GATEWAY_REQUESTS);
486-
if (requestsOptions.isEnabled()) {
491+
if (requestsOptions.isEnabled() &&
492+
(!requestsOptions.isDiagnosticThresholdsFilteringEnabled() || ctx.isThresholdViolated())) {
487493
Counter requestCounter = Counter
488494
.builder(requestsOptions.getMeterName().toString())
489495
.baseUnit("requests")
@@ -499,7 +505,8 @@ private void recordQueryPlanDiagnostics(
499505
CosmosMeterOptions latencyOptions = clientAccessor.getMeterOptions(
500506
cosmosAsyncClient,
501507
CosmosMetricName.REQUEST_SUMMARY_GATEWAY_LATENCY);
502-
if (latencyOptions.isEnabled()) {
508+
if (latencyOptions.isEnabled() &&
509+
(!latencyOptions.isDiagnosticThresholdsFilteringEnabled() || ctx.isThresholdViolated())) {
503510
Timer requestLatencyMeter = Timer
504511
.builder(latencyOptions.getMeterName().toString())
505512
.description("Gateway Request latency")
@@ -513,20 +520,23 @@ private void recordQueryPlanDiagnostics(
513520
}
514521

515522
recordRequestTimeline(
523+
ctx,
516524
cosmosAsyncClient,
517525
CosmosMetricName.REQUEST_DETAILS_GATEWAY_TIMELINE,
518526
queryPlanDiagnostics.getRequestTimeline(), requestTags);
519527
}
520528

521529
private void recordRequestPayloadSizes(
530+
CosmosDiagnosticsContext ctx,
522531
CosmosAsyncClient client,
523532
int requestPayloadSizeInBytes,
524533
int responsePayloadSizeInBytes
525534
) {
526535
CosmosMeterOptions reqSizeOptions = clientAccessor.getMeterOptions(
527536
client,
528537
CosmosMetricName.REQUEST_SUMMARY_SIZE_REQUEST);
529-
if (reqSizeOptions.isEnabled()) {
538+
if (reqSizeOptions.isEnabled() &&
539+
(!reqSizeOptions.isDiagnosticThresholdsFilteringEnabled() || ctx.isThresholdViolated())) {
530540
DistributionSummary requestPayloadSizeMeter = DistributionSummary
531541
.builder(reqSizeOptions.getMeterName().toString())
532542
.baseUnit("bytes")
@@ -542,7 +552,8 @@ private void recordRequestPayloadSizes(
542552
CosmosMeterOptions rspSizeOptions = clientAccessor.getMeterOptions(
543553
client,
544554
CosmosMetricName.REQUEST_SUMMARY_SIZE_RESPONSE);
545-
if (rspSizeOptions.isEnabled()) {
555+
if (rspSizeOptions.isEnabled() &&
556+
(!rspSizeOptions.isDiagnosticThresholdsFilteringEnabled() || ctx.isThresholdViolated())) {
546557
DistributionSummary responsePayloadSizeMeter = DistributionSummary
547558
.builder(rspSizeOptions.getMeterName().toString())
548559
.baseUnit("bytes")
@@ -752,6 +763,7 @@ private void recordRntbdEndpointStatistics(
752763
}
753764

754765
private void recordRequestTimeline(
766+
CosmosDiagnosticsContext ctx,
755767
CosmosAsyncClient client,
756768
CosmosMetricName name,
757769
RequestTimeline requestTimeline,
@@ -764,7 +776,8 @@ private void recordRequestTimeline(
764776
CosmosMeterOptions timelineOptions = clientAccessor.getMeterOptions(
765777
client,
766778
name);
767-
if (!timelineOptions.isEnabled()) {
779+
if (!timelineOptions.isEnabled() ||
780+
(timelineOptions.isDiagnosticThresholdsFilteringEnabled() && !ctx.isThresholdViolated())) {
768781
return;
769782
}
770783
for (RequestTimeline.Event event : requestTimeline) {
@@ -786,6 +799,7 @@ private void recordRequestTimeline(
786799
}
787800

788801
private void recordStoreResponseStatistics(
802+
CosmosDiagnosticsContext ctx,
789803
CosmosAsyncClient client,
790804
List<ClientSideRequestStatistics.StoreResponseStatistics> storeResponseStatistics) {
791805

@@ -818,7 +832,8 @@ private void recordStoreResponseStatistics(
818832
CosmosMeterOptions beLatencyOptions = clientAccessor.getMeterOptions(
819833
client,
820834
CosmosMetricName.REQUEST_SUMMARY_DIRECT_BACKEND_LATENCY);
821-
if (beLatencyOptions.isEnabled()) {
835+
if (beLatencyOptions.isEnabled() &&
836+
(!beLatencyOptions.isDiagnosticThresholdsFilteringEnabled() || ctx.isThresholdViolated())) {
822837
DistributionSummary backendRequestLatencyMeter = DistributionSummary
823838
.builder(beLatencyOptions.getMeterName().toString())
824839
.baseUnit("ms")
@@ -835,7 +850,8 @@ private void recordStoreResponseStatistics(
835850
CosmosMeterOptions ruOptions = clientAccessor.getMeterOptions(
836851
client,
837852
CosmosMetricName.REQUEST_SUMMARY_DIRECT_REQUEST_CHARGE);
838-
if (ruOptions.isEnabled()) {
853+
if (ruOptions.isEnabled() &&
854+
(!ruOptions.isDiagnosticThresholdsFilteringEnabled() || ctx.isThresholdViolated())) {
839855
double requestCharge = storeResponseDiagnostics.getRequestCharge();
840856
DistributionSummary requestChargeMeter = DistributionSummary
841857
.builder(ruOptions.getMeterName().toString())
@@ -852,7 +868,8 @@ private void recordStoreResponseStatistics(
852868
CosmosMeterOptions latencyOptions = clientAccessor.getMeterOptions(
853869
client,
854870
CosmosMetricName.REQUEST_SUMMARY_DIRECT_LATENCY);
855-
if (latencyOptions.isEnabled()) {
871+
if (latencyOptions.isEnabled() &&
872+
(!latencyOptions.isDiagnosticThresholdsFilteringEnabled() || ctx.isThresholdViolated())) {
856873
Duration latency = responseStatistics.getDuration();
857874
if (latency != null) {
858875
Timer requestLatencyMeter = Timer
@@ -870,7 +887,8 @@ private void recordStoreResponseStatistics(
870887
CosmosMeterOptions reqOptions = clientAccessor.getMeterOptions(
871888
client,
872889
CosmosMetricName.REQUEST_SUMMARY_DIRECT_REQUESTS);
873-
if (reqOptions.isEnabled()) {
890+
if (reqOptions.isEnabled() &&
891+
(!reqOptions.isDiagnosticThresholdsFilteringEnabled() || ctx.isThresholdViolated())) {
874892
Counter requestCounter = Counter
875893
.builder(reqOptions.getMeterName().toString())
876894
.baseUnit("requests")
@@ -882,12 +900,14 @@ private void recordStoreResponseStatistics(
882900

883901
if (this.metricCategories.contains(MetricCategory.RequestDetails)) {
884902
recordRequestTimeline(
903+
ctx,
885904
client,
886905
CosmosMetricName.REQUEST_DETAILS_DIRECT_TIMELINE,
887906
storeResponseDiagnostics.getRequestTimeline(), requestTags);
888907
}
889908

890909
recordRequestPayloadSizes(
910+
ctx,
891911
client,
892912
storeResponseDiagnostics.getRequestPayloadLength(),
893913
storeResponseDiagnostics.getResponsePayloadLength()
@@ -901,6 +921,7 @@ private void recordStoreResponseStatistics(
901921
}
902922

903923
private void recordGatewayStatistics(
924+
CosmosDiagnosticsContext ctx,
904925
CosmosAsyncClient client,
905926
Duration latency,
906927
ClientSideRequestStatistics.GatewayStatistics gatewayStatistics) {
@@ -930,7 +951,8 @@ private void recordGatewayStatistics(
930951
CosmosMeterOptions reqOptions = clientAccessor.getMeterOptions(
931952
client,
932953
CosmosMetricName.REQUEST_SUMMARY_GATEWAY_REQUESTS);
933-
if (reqOptions.isEnabled()) {
954+
if (reqOptions.isEnabled() &&
955+
(!reqOptions.isDiagnosticThresholdsFilteringEnabled() || ctx.isThresholdViolated())) {
934956
Counter requestCounter = Counter
935957
.builder(reqOptions.getMeterName().toString())
936958
.baseUnit("requests")
@@ -943,7 +965,8 @@ private void recordGatewayStatistics(
943965
CosmosMeterOptions ruOptions = clientAccessor.getMeterOptions(
944966
client,
945967
CosmosMetricName.REQUEST_SUMMARY_GATEWAY_REQUEST_CHARGE);
946-
if (ruOptions.isEnabled()) {
968+
if (ruOptions.isEnabled() &&
969+
(!ruOptions.isDiagnosticThresholdsFilteringEnabled() || ctx.isThresholdViolated())) {
947970
double requestCharge = gatewayStatistics.getRequestCharge();
948971
DistributionSummary requestChargeMeter = DistributionSummary
949972
.builder(ruOptions.getMeterName().toString())
@@ -961,7 +984,8 @@ private void recordGatewayStatistics(
961984
CosmosMeterOptions latencyOptions = clientAccessor.getMeterOptions(
962985
client,
963986
CosmosMetricName.REQUEST_SUMMARY_GATEWAY_LATENCY);
964-
if (latencyOptions.isEnabled()) {
987+
if (latencyOptions.isEnabled() &&
988+
(!latencyOptions.isDiagnosticThresholdsFilteringEnabled() || ctx.isThresholdViolated())) {
965989
Timer requestLatencyMeter = Timer
966990
.builder(latencyOptions.getMeterName().toString())
967991
.description("Gateway Request latency")
@@ -975,12 +999,14 @@ private void recordGatewayStatistics(
975999
}
9761000

9771001
recordRequestTimeline(
1002+
ctx,
9781003
client,
9791004
CosmosMetricName.REQUEST_DETAILS_GATEWAY_TIMELINE,
9801005
gatewayStatistics.getRequestTimeline(), requestTags);
9811006
}
9821007

9831008
private void recordAddressResolutionStatistics(
1009+
CosmosDiagnosticsContext ctx,
9841010
CosmosAsyncClient client,
9851011
Map<String, ClientSideRequestStatistics.AddressResolutionStatistics> addressResolutionStatisticsMap) {
9861012

@@ -1019,7 +1045,8 @@ private void recordAddressResolutionStatistics(
10191045
CosmosMeterOptions latencyOptions = clientAccessor.getMeterOptions(
10201046
client,
10211047
CosmosMetricName.DIRECT_ADDRESS_RESOLUTION_LATENCY);
1022-
if (latencyOptions.isEnabled()) {
1048+
if (latencyOptions.isEnabled() &&
1049+
(!latencyOptions.isDiagnosticThresholdsFilteringEnabled() || ctx.isThresholdViolated())) {
10231050
Timer addressResolutionLatencyMeter = Timer
10241051
.builder(latencyOptions.getMeterName().toString())
10251052
.description("Address resolution latency")
@@ -1034,7 +1061,8 @@ private void recordAddressResolutionStatistics(
10341061
CosmosMeterOptions reqOptions = clientAccessor.getMeterOptions(
10351062
client,
10361063
CosmosMetricName.DIRECT_ADDRESS_RESOLUTION_REQUESTS);
1037-
if (reqOptions.isEnabled()) {
1064+
if (reqOptions.isEnabled() &&
1065+
(!reqOptions.isDiagnosticThresholdsFilteringEnabled() || ctx.isThresholdViolated())) {
10381066
Counter requestCounter = Counter
10391067
.builder(reqOptions.getMeterName().toString())
10401068
.baseUnit("requests")

sdk/cosmos/azure-cosmos/src/main/java/com/azure/cosmos/implementation/clienttelemetry/CosmosMeterOptions.java

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -15,18 +15,22 @@ public final class CosmosMeterOptions {
1515
private final boolean isHistogramPublishingEnabled;
1616
private final EnumSet<TagName> suppressedTagNames;
1717

18+
private final boolean isDiagnosticThresholdsFilteringEnabled;
19+
1820
public CosmosMeterOptions(
1921
CosmosMetricName name,
2022
boolean isEnabled,
2123
double[] percentiles,
2224
boolean isHistogramPublishingEnabled,
23-
EnumSet<TagName> suppressedTagNames) {
25+
EnumSet<TagName> suppressedTagNames,
26+
boolean isDiagnosticThresholdsFilteringEnabled) {
2427

2528
this.name = name;
2629
this.isEnabled = isEnabled;
2730
this.percentiles = percentiles != null ? percentiles.clone() : new double[0];
2831
this.isHistogramPublishingEnabled = isHistogramPublishingEnabled;
2932
this.suppressedTagNames = suppressedTagNames;
33+
this.isDiagnosticThresholdsFilteringEnabled = isDiagnosticThresholdsFilteringEnabled;
3034
}
3135

3236
public CosmosMetricName getMeterName() {
@@ -48,4 +52,6 @@ public double[] getPercentiles() {
4852
public boolean isEnabled() {
4953
return this.isEnabled;
5054
}
55+
56+
public boolean isDiagnosticThresholdsFilteringEnabled() { return this.isDiagnosticThresholdsFilteringEnabled; }
5157
}

sdk/cosmos/azure-cosmos/src/main/java/com/azure/cosmos/models/CosmosClientTelemetryConfig.java

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -505,7 +505,8 @@ public CosmosMeterOptions createDisabledMeterOptions(CosmosMetricName name) {
505505
false,
506506
new double[0],
507507
false,
508-
EnumSet.noneOf(TagName.class));
508+
EnumSet.noneOf(TagName.class),
509+
false);
509510
}
510511

511512
@Override

0 commit comments

Comments
 (0)