Skip to content

Commit 3566ee9

Browse files
authored
Track RequestedRangeNotSatisfiedException separately in S3 Metrics (#109657)
Due to RCO changes, we started getting a lot of `RequestedRangeNotSatisfiedException`s, which are expected. We would like to track them separately. This change adds two new metrics to track these errors, analogous to the metrics for other S3 errors: `es.repositories.exceptions.request_range_not_satisfied.total` and `es.repositories.exceptions.request_range_not_satisfied.histogram`. In the future, we can add the error code as an attribute to the metrics, so that the same approach can be adapted to all client errors.
1 parent e79ee17 commit 3566ee9

File tree

4 files changed

+88
-11
lines changed

4 files changed

+88
-11
lines changed

docs/changelog/109657.yaml

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,5 @@
1+
pr: 109657
2+
summary: Track `RequestedRangeNotSatisfiedException` separately in S3 Metrics
3+
area: Snapshot/Restore
4+
type: enhancement
5+
issues: []

modules/repository-s3/src/internalClusterTest/java/org/elasticsearch/repositories/s3/S3BlobStoreRepositoryMetricsTests.java

Lines changed: 57 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -14,7 +14,6 @@
1414
import org.elasticsearch.cluster.node.DiscoveryNode;
1515
import org.elasticsearch.common.blobstore.BlobContainer;
1616
import org.elasticsearch.common.blobstore.BlobPath;
17-
import org.elasticsearch.common.blobstore.BlobStore;
1817
import org.elasticsearch.common.blobstore.OperationPurpose;
1918
import org.elasticsearch.common.bytes.BytesArray;
2019
import org.elasticsearch.common.collect.Iterators;
@@ -23,6 +22,7 @@
2322
import org.elasticsearch.plugins.PluginsService;
2423
import org.elasticsearch.repositories.RepositoriesService;
2524
import org.elasticsearch.repositories.blobstore.BlobStoreRepository;
25+
import org.elasticsearch.repositories.blobstore.RequestedRangeNotSatisfiedException;
2626
import org.elasticsearch.repositories.s3.S3BlobStore.Operation;
2727
import org.elasticsearch.rest.RestStatus;
2828
import org.elasticsearch.telemetry.Measurement;
@@ -39,6 +39,7 @@
3939

4040
import static org.elasticsearch.repositories.RepositoriesMetrics.HTTP_REQUEST_TIME_IN_MICROS_HISTOGRAM;
4141
import static org.elasticsearch.repositories.RepositoriesMetrics.METRIC_EXCEPTIONS_HISTOGRAM;
42+
import static org.elasticsearch.repositories.RepositoriesMetrics.METRIC_EXCEPTIONS_REQUEST_RANGE_NOT_SATISFIED_TOTAL;
4243
import static org.elasticsearch.repositories.RepositoriesMetrics.METRIC_EXCEPTIONS_TOTAL;
4344
import static org.elasticsearch.repositories.RepositoriesMetrics.METRIC_OPERATIONS_TOTAL;
4445
import static org.elasticsearch.repositories.RepositoriesMetrics.METRIC_REQUESTS_TOTAL;
@@ -47,8 +48,10 @@
4748
import static org.elasticsearch.repositories.RepositoriesMetrics.METRIC_UNSUCCESSFUL_OPERATIONS_TOTAL;
4849
import static org.elasticsearch.rest.RestStatus.INTERNAL_SERVER_ERROR;
4950
import static org.elasticsearch.rest.RestStatus.NOT_FOUND;
51+
import static org.elasticsearch.rest.RestStatus.REQUESTED_RANGE_NOT_SATISFIED;
5052
import static org.elasticsearch.rest.RestStatus.TOO_MANY_REQUESTS;
5153
import static org.hamcrest.Matchers.equalTo;
54+
import static org.hamcrest.Matchers.instanceOf;
5255

5356
@SuppressForbidden(reason = "this test uses a HttpServer to emulate an S3 endpoint")
5457
// Need to set up a new cluster for each test because cluster settings use randomized authentication settings
@@ -80,22 +83,29 @@ protected Settings nodeSettings(int nodeOrdinal, Settings otherSettings) {
8083
.build();
8184
}
8285

83-
public void testMetricsWithErrors() throws IOException {
84-
final String repository = createRepository(randomRepositoryName());
85-
86-
final String dataNodeName = internalCluster().getNodeNameThat(DiscoveryNode::canContainData);
87-
final var blobStoreRepository = (BlobStoreRepository) internalCluster().getInstance(RepositoriesService.class, dataNodeName)
88-
.repository(repository);
89-
final BlobStore blobStore = blobStoreRepository.blobStore();
90-
final TestTelemetryPlugin plugin = internalCluster().getInstance(PluginsService.class, dataNodeName)
86+
private static TestTelemetryPlugin getPlugin(String dataNodeName) {
87+
var plugin = internalCluster().getInstance(PluginsService.class, dataNodeName)
9188
.filterPlugins(TestTelemetryPlugin.class)
9289
.findFirst()
9390
.orElseThrow();
94-
9591
plugin.resetMeter();
92+
return plugin;
93+
}
94+
95+
private static BlobContainer getBlobContainer(String dataNodeName, String repository) {
96+
final var blobStoreRepository = (BlobStoreRepository) internalCluster().getInstance(RepositoriesService.class, dataNodeName)
97+
.repository(repository);
98+
return blobStoreRepository.blobStore().blobContainer(BlobPath.EMPTY.add(randomIdentifier()));
99+
}
100+
101+
public void testMetricsWithErrors() throws IOException {
102+
final String repository = createRepository(randomRepositoryName());
103+
104+
final String dataNodeName = internalCluster().getNodeNameThat(DiscoveryNode::canContainData);
105+
final TestTelemetryPlugin plugin = getPlugin(dataNodeName);
96106

97107
final OperationPurpose purpose = randomFrom(OperationPurpose.values());
98-
final BlobContainer blobContainer = blobStore.blobContainer(BlobPath.EMPTY.add(randomIdentifier()));
108+
final BlobContainer blobContainer = getBlobContainer(dataNodeName, repository);
99109
final String blobName = randomIdentifier();
100110

101111
// Put a blob
@@ -132,6 +142,9 @@ public void testMetricsWithErrors() throws IOException {
132142
assertThat(getLongHistogramValue(plugin, METRIC_EXCEPTIONS_HISTOGRAM, Operation.GET_OBJECT), equalTo(batch));
133143
assertThat(getLongHistogramValue(plugin, METRIC_THROTTLES_HISTOGRAM, Operation.GET_OBJECT), equalTo(batch));
134144
assertThat(getNumberOfMeasurements(plugin, HTTP_REQUEST_TIME_IN_MICROS_HISTOGRAM, Operation.GET_OBJECT), equalTo(batch));
145+
146+
// Make sure we don't hit the request range not satisfied counters
147+
assertThat(getLongCounterValue(plugin, METRIC_EXCEPTIONS_REQUEST_RANGE_NOT_SATISFIED_TOTAL, Operation.GET_OBJECT), equalTo(0L));
135148
}
136149

137150
// List retry exhausted
@@ -166,6 +179,39 @@ public void testMetricsWithErrors() throws IOException {
166179
assertThat(getNumberOfMeasurements(plugin, HTTP_REQUEST_TIME_IN_MICROS_HISTOGRAM, Operation.DELETE_OBJECTS), equalTo(1L));
167180
}
168181

182+
public void testMetricsForRequestRangeNotSatisfied() {
183+
final String repository = createRepository(randomRepositoryName());
184+
final String dataNodeName = internalCluster().getNodeNameThat(DiscoveryNode::canContainData);
185+
final BlobContainer blobContainer = getBlobContainer(dataNodeName, repository);
186+
final TestTelemetryPlugin plugin = getPlugin(dataNodeName);
187+
188+
final OperationPurpose purpose = randomFrom(OperationPurpose.values());
189+
final String blobName = randomIdentifier();
190+
191+
for (int i = 0; i < randomIntBetween(1, 3); i++) {
192+
final long batch = i + 1;
193+
addErrorStatus(TOO_MANY_REQUESTS, TOO_MANY_REQUESTS, REQUESTED_RANGE_NOT_SATISFIED);
194+
try {
195+
blobContainer.readBlob(purpose, blobName).close();
196+
} catch (Exception e) {
197+
assertThat(e, instanceOf(RequestedRangeNotSatisfiedException.class));
198+
}
199+
200+
assertThat(getLongCounterValue(plugin, METRIC_REQUESTS_TOTAL, Operation.GET_OBJECT), equalTo(3 * batch));
201+
assertThat(getLongCounterValue(plugin, METRIC_OPERATIONS_TOTAL, Operation.GET_OBJECT), equalTo(batch));
202+
assertThat(getLongCounterValue(plugin, METRIC_UNSUCCESSFUL_OPERATIONS_TOTAL, Operation.GET_OBJECT), equalTo(batch));
203+
assertThat(getLongCounterValue(plugin, METRIC_EXCEPTIONS_TOTAL, Operation.GET_OBJECT), equalTo(batch));
204+
assertThat(getLongHistogramValue(plugin, METRIC_EXCEPTIONS_HISTOGRAM, Operation.GET_OBJECT), equalTo(batch));
205+
assertThat(
206+
getLongCounterValue(plugin, METRIC_EXCEPTIONS_REQUEST_RANGE_NOT_SATISFIED_TOTAL, Operation.GET_OBJECT),
207+
equalTo(batch)
208+
);
209+
assertThat(getLongCounterValue(plugin, METRIC_THROTTLES_TOTAL, Operation.GET_OBJECT), equalTo(2 * batch));
210+
assertThat(getLongHistogramValue(plugin, METRIC_THROTTLES_HISTOGRAM, Operation.GET_OBJECT), equalTo(2 * batch));
211+
assertThat(getNumberOfMeasurements(plugin, HTTP_REQUEST_TIME_IN_MICROS_HISTOGRAM, Operation.GET_OBJECT), equalTo(batch));
212+
}
213+
}
214+
169215
private void addErrorStatus(RestStatus... statuses) {
170216
errorStatusQueue.addAll(Arrays.asList(statuses));
171217
}

modules/repository-s3/src/main/java/org/elasticsearch/repositories/s3/S3BlobStore.java

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -52,6 +52,7 @@
5252
import java.util.stream.Collectors;
5353

5454
import static org.elasticsearch.core.Strings.format;
55+
import static org.elasticsearch.rest.RestStatus.REQUESTED_RANGE_NOT_SATISFIED;
5556

5657
class S3BlobStore implements BlobStore {
5758

@@ -177,6 +178,23 @@ public final void collectMetrics(Request<?> request, Response<?> response) {
177178
.map(List::size)
178179
.orElse(0);
179180

181+
if (exceptionCount > 0) {
182+
final List<Object> statusCodes = Objects.requireNonNullElse(
183+
awsRequestMetrics.getProperty(AWSRequestMetrics.Field.StatusCode),
184+
List.of()
185+
);
186+
// REQUESTED_RANGE_NOT_SATISFIED errors are expected errors due to RCO
187+
// TODO Add more expected client error codes?
188+
final long amountOfRequestRangeNotSatisfiedErrors = statusCodes.stream()
189+
.filter(e -> (Integer) e == REQUESTED_RANGE_NOT_SATISFIED.getStatus())
190+
.count();
191+
if (amountOfRequestRangeNotSatisfiedErrors > 0) {
192+
s3RepositoriesMetrics.common()
193+
.requestRangeNotSatisfiedExceptionCounter()
194+
.incrementBy(amountOfRequestRangeNotSatisfiedErrors, attributes);
195+
}
196+
}
197+
180198
s3RepositoriesMetrics.common().operationCounter().incrementBy(1, attributes);
181199
if (numberOfAwsErrors == requestCount) {
182200
s3RepositoriesMetrics.common().unsuccessfulOperationCounter().incrementBy(1, attributes);

server/src/main/java/org/elasticsearch/repositories/RepositoriesMetrics.java

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,7 @@ public record RepositoriesMetrics(
1616
MeterRegistry meterRegistry,
1717
LongCounter requestCounter,
1818
LongCounter exceptionCounter,
19+
LongCounter requestRangeNotSatisfiedExceptionCounter,
1920
LongCounter throttleCounter,
2021
LongCounter operationCounter,
2122
LongCounter unsuccessfulOperationCounter,
@@ -28,6 +29,8 @@ public record RepositoriesMetrics(
2829

2930
public static final String METRIC_REQUESTS_TOTAL = "es.repositories.requests.total";
3031
public static final String METRIC_EXCEPTIONS_TOTAL = "es.repositories.exceptions.total";
32+
public static final String METRIC_EXCEPTIONS_REQUEST_RANGE_NOT_SATISFIED_TOTAL =
33+
"es.repositories.exceptions.request_range_not_satisfied.total";
3134
public static final String METRIC_THROTTLES_TOTAL = "es.repositories.throttles.total";
3235
public static final String METRIC_OPERATIONS_TOTAL = "es.repositories.operations.total";
3336
public static final String METRIC_UNSUCCESSFUL_OPERATIONS_TOTAL = "es.repositories.operations.unsuccessful.total";
@@ -40,6 +43,11 @@ public RepositoriesMetrics(MeterRegistry meterRegistry) {
4043
meterRegistry,
4144
meterRegistry.registerLongCounter(METRIC_REQUESTS_TOTAL, "repository request counter", "unit"),
4245
meterRegistry.registerLongCounter(METRIC_EXCEPTIONS_TOTAL, "repository request exception counter", "unit"),
46+
meterRegistry.registerLongCounter(
47+
METRIC_EXCEPTIONS_REQUEST_RANGE_NOT_SATISFIED_TOTAL,
48+
"repository request RequestedRangeNotSatisfiedException counter",
49+
"unit"
50+
),
4351
meterRegistry.registerLongCounter(METRIC_THROTTLES_TOTAL, "repository request throttle counter", "unit"),
4452
meterRegistry.registerLongCounter(METRIC_OPERATIONS_TOTAL, "repository operation counter", "unit"),
4553
meterRegistry.registerLongCounter(METRIC_UNSUCCESSFUL_OPERATIONS_TOTAL, "repository unsuccessful operation counter", "unit"),

0 commit comments

Comments
 (0)