Skip to content

Commit 16864e9

Browse files
authored
Retry throttled snapshot deletions (#113237)
Closes ES-8562
1 parent 837c0e8 commit 16864e9

File tree

9 files changed

+540
-41
lines changed

9 files changed

+540
-41
lines changed

docs/changelog/113237.yaml

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,5 @@
1+
pr: 113237
2+
summary: Retry throttled snapshot deletions
3+
area: Snapshot/Restore
4+
type: bug
5+
issues: []

docs/reference/snapshot-restore/repository-s3.asciidoc

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -329,6 +329,20 @@ include::repository-shared-settings.asciidoc[]
329329
`1000` which is the maximum number supported by the https://docs.aws.amazon.com/AmazonS3/latest/API/API_ListMultipartUploads.html[AWS
330330
ListMultipartUploads API]. If set to `0`, {es} will not attempt to clean up dangling multipart uploads.
331331

332+
`throttled_delete_retry.delay_increment`::
333+
334+
(<<time-units,time value>>) This value is used as the delay before the first retry and the amount the delay is incremented by on each subsequent retry. Default is 50ms, minimum is 0ms.
335+
336+
`throttled_delete_retry.maximum_delay`::
337+
338+
(<<time-units,time value>>) This is the upper bound on how long the delays between retries will grow to. Default is 5s, minimum is 0ms.
339+
340+
`throttled_delete_retry.maximum_number_of_retries`::
341+
342+
(integer) Sets the number of times to retry a throttled snapshot deletion. Defaults to `10`, minimum value is `0` which
343+
will disable retries altogether. Note that if retries are enabled in the S3 client, each of these retries
344+
comprises that many client-level retries.
345+
332346
NOTE: The option of defining client settings in the repository settings as
333347
documented below is considered deprecated, and will be removed in a future
334348
version.

modules/repository-s3/src/internalClusterTest/java/org/elasticsearch/repositories/s3/S3BlobStoreRepositoryMetricsTests.java

Lines changed: 108 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,7 @@
2020
import org.elasticsearch.common.collect.Iterators;
2121
import org.elasticsearch.common.settings.Settings;
2222
import org.elasticsearch.core.SuppressForbidden;
23+
import org.elasticsearch.core.TimeValue;
2324
import org.elasticsearch.plugins.PluginsService;
2425
import org.elasticsearch.repositories.RepositoriesService;
2526
import org.elasticsearch.repositories.blobstore.BlobStoreRepository;
@@ -31,13 +32,16 @@
3132
import org.elasticsearch.test.ESIntegTestCase;
3233

3334
import java.io.IOException;
35+
import java.nio.charset.StandardCharsets;
36+
import java.util.ArrayList;
3437
import java.util.Arrays;
3538
import java.util.Collections;
3639
import java.util.List;
3740
import java.util.Map;
3841
import java.util.Queue;
3942
import java.util.concurrent.LinkedBlockingQueue;
4043
import java.util.concurrent.TimeUnit;
44+
import java.util.stream.IntStream;
4145

4246
import static org.elasticsearch.repositories.RepositoriesMetrics.HTTP_REQUEST_TIME_IN_MILLIS_HISTOGRAM;
4347
import static org.elasticsearch.repositories.RepositoriesMetrics.METRIC_EXCEPTIONS_HISTOGRAM;
@@ -48,9 +52,11 @@
4852
import static org.elasticsearch.repositories.RepositoriesMetrics.METRIC_THROTTLES_HISTOGRAM;
4953
import static org.elasticsearch.repositories.RepositoriesMetrics.METRIC_THROTTLES_TOTAL;
5054
import static org.elasticsearch.repositories.RepositoriesMetrics.METRIC_UNSUCCESSFUL_OPERATIONS_TOTAL;
55+
import static org.elasticsearch.repositories.s3.S3RepositoriesMetrics.METRIC_DELETE_RETRIES_HISTOGRAM;
5156
import static org.elasticsearch.rest.RestStatus.INTERNAL_SERVER_ERROR;
5257
import static org.elasticsearch.rest.RestStatus.NOT_FOUND;
5358
import static org.elasticsearch.rest.RestStatus.REQUESTED_RANGE_NOT_SATISFIED;
59+
import static org.elasticsearch.rest.RestStatus.SERVICE_UNAVAILABLE;
5460
import static org.elasticsearch.rest.RestStatus.TOO_MANY_REQUESTS;
5561
import static org.hamcrest.Matchers.equalTo;
5662
import static org.hamcrest.Matchers.instanceOf;
@@ -61,14 +67,22 @@
6167
@ESIntegTestCase.ClusterScope(scope = ESIntegTestCase.Scope.TEST)
6268
public class S3BlobStoreRepositoryMetricsTests extends S3BlobStoreRepositoryTests {
6369

64-
private final Queue<RestStatus> errorStatusQueue = new LinkedBlockingQueue<>();
70+
private static final S3ErrorResponse S3_SLOW_DOWN_RESPONSE = new S3ErrorResponse(SERVICE_UNAVAILABLE, """
71+
<?xml version="1.0" encoding="UTF-8"?>
72+
<Error>
73+
<Code>SlowDown</Code>
74+
<Message>This is a throttling message</Message>
75+
<Resource>/bucket/</Resource>
76+
<RequestId>4442587FB7D0A2F9</RequestId>
77+
</Error>""");
78+
private final Queue<S3ErrorResponse> errorResponseQueue = new LinkedBlockingQueue<>();
6579

6680
// Always create erroneous handler
6781
@Override
6882
protected Map<String, HttpHandler> createHttpHandlers() {
6983
return Collections.singletonMap(
7084
"/bucket",
71-
new S3StatsCollectorHttpHandler(new S3MetricErroneousHttpHandler(new S3BlobStoreHttpHandler("bucket"), errorStatusQueue))
85+
new S3StatsCollectorHttpHandler(new S3MetricErroneousHttpHandler(new S3BlobStoreHttpHandler("bucket"), errorResponseQueue))
7286
);
7387
}
7488

@@ -244,8 +258,74 @@ public void testMetricsForRequestRangeNotSatisfied() {
244258
}
245259
}
246260

261+
public void testRetrySnapshotDeleteMetricsOnEventualSuccess() throws IOException {
262+
final int maxRetries = 5;
263+
final String repositoryName = randomRepositoryName();
264+
// Disable retries in the client for this repo
265+
createRepository(
266+
repositoryName,
267+
Settings.builder()
268+
.put(repositorySettings(repositoryName))
269+
.put(S3ClientSettings.MAX_RETRIES_SETTING.getConcreteSettingForNamespace("placeholder").getKey(), 0)
270+
.put(S3Repository.RETRY_THROTTLED_DELETE_DELAY_INCREMENT.getKey(), TimeValue.timeValueMillis(10))
271+
.put(S3Repository.RETRY_THROTTLED_DELETE_MAX_NUMBER_OF_RETRIES.getKey(), maxRetries)
272+
.build(),
273+
false
274+
);
275+
final String dataNodeName = internalCluster().getNodeNameThat(DiscoveryNode::canContainData);
276+
final BlobContainer blobContainer = getBlobContainer(dataNodeName, repositoryName);
277+
final TestTelemetryPlugin plugin = getPlugin(dataNodeName);
278+
final int numberOfDeletes = randomIntBetween(1, 3);
279+
final List<Long> numberOfRetriesPerAttempt = new ArrayList<>();
280+
for (int i = 0; i < numberOfDeletes; i++) {
281+
int numFailures = randomIntBetween(1, maxRetries);
282+
numberOfRetriesPerAttempt.add((long) numFailures);
283+
IntStream.range(0, numFailures).forEach(ignored -> addErrorStatus(S3_SLOW_DOWN_RESPONSE));
284+
blobContainer.deleteBlobsIgnoringIfNotExists(
285+
randomFrom(OperationPurpose.SNAPSHOT_DATA, OperationPurpose.SNAPSHOT_METADATA),
286+
List.of(randomIdentifier()).iterator()
287+
);
288+
}
289+
List<Measurement> longHistogramMeasurement = plugin.getLongHistogramMeasurement(METRIC_DELETE_RETRIES_HISTOGRAM);
290+
assertThat(longHistogramMeasurement.stream().map(Measurement::getLong).toList(), equalTo(numberOfRetriesPerAttempt));
291+
}
292+
293+
public void testRetrySnapshotDeleteMetricsWhenRetriesExhausted() {
294+
final String repositoryName = randomRepositoryName();
295+
// Disable retries in the client for this repo
296+
int maxRetries = 3;
297+
createRepository(
298+
repositoryName,
299+
Settings.builder()
300+
.put(repositorySettings(repositoryName))
301+
.put(S3ClientSettings.MAX_RETRIES_SETTING.getConcreteSettingForNamespace("placeholder").getKey(), 0)
302+
.put(S3Repository.RETRY_THROTTLED_DELETE_DELAY_INCREMENT.getKey(), TimeValue.timeValueMillis(10))
303+
.put(S3Repository.RETRY_THROTTLED_DELETE_MAX_NUMBER_OF_RETRIES.getKey(), maxRetries)
304+
.build(),
305+
false
306+
);
307+
final String dataNodeName = internalCluster().getNodeNameThat(DiscoveryNode::canContainData);
308+
final BlobContainer blobContainer = getBlobContainer(dataNodeName, repositoryName);
309+
final TestTelemetryPlugin plugin = getPlugin(dataNodeName);
310+
// Keep throttling past the max number of retries
311+
IntStream.range(0, maxRetries + 1).forEach(ignored -> addErrorStatus(S3_SLOW_DOWN_RESPONSE));
312+
assertThrows(
313+
IOException.class,
314+
() -> blobContainer.deleteBlobsIgnoringIfNotExists(
315+
randomFrom(OperationPurpose.SNAPSHOT_DATA, OperationPurpose.SNAPSHOT_METADATA),
316+
List.of(randomIdentifier()).iterator()
317+
)
318+
);
319+
List<Measurement> longHistogramMeasurement = plugin.getLongHistogramMeasurement(METRIC_DELETE_RETRIES_HISTOGRAM);
320+
assertThat(longHistogramMeasurement.get(0).getLong(), equalTo(3L));
321+
}
322+
247323
private void addErrorStatus(RestStatus... statuses) {
248-
errorStatusQueue.addAll(Arrays.asList(statuses));
324+
errorResponseQueue.addAll(Arrays.stream(statuses).map(S3ErrorResponse::new).toList());
325+
}
326+
327+
private void addErrorStatus(S3ErrorResponse... responses) {
328+
errorResponseQueue.addAll(Arrays.asList(responses));
249329
}
250330

251331
private long getLongCounterValue(TestTelemetryPlugin plugin, String instrumentName, Operation operation) {
@@ -275,25 +355,25 @@ private long getLongHistogramValue(TestTelemetryPlugin plugin, String instrument
275355
private static class S3MetricErroneousHttpHandler implements DelegatingHttpHandler {
276356

277357
private final HttpHandler delegate;
278-
private final Queue<RestStatus> errorStatusQueue;
358+
private final Queue<S3ErrorResponse> errorResponseQueue;
279359

280-
S3MetricErroneousHttpHandler(HttpHandler delegate, Queue<RestStatus> errorStatusQueue) {
360+
S3MetricErroneousHttpHandler(HttpHandler delegate, Queue<S3ErrorResponse> errorResponseQueue) {
281361
this.delegate = delegate;
282-
this.errorStatusQueue = errorStatusQueue;
362+
this.errorResponseQueue = errorResponseQueue;
283363
}
284364

285365
@Override
286366
public void handle(HttpExchange exchange) throws IOException {
287-
final RestStatus status = errorStatusQueue.poll();
288-
if (status == null) {
367+
final S3ErrorResponse errorResponse = errorResponseQueue.poll();
368+
if (errorResponse == null) {
289369
delegate.handle(exchange);
290-
} else if (status == INTERNAL_SERVER_ERROR) {
370+
} else if (errorResponse.status == INTERNAL_SERVER_ERROR) {
291371
// Simulate a retryable exception
292372
throw new IOException("ouch");
293373
} else {
294374
try (exchange) {
295375
drainInputStream(exchange.getRequestBody());
296-
exchange.sendResponseHeaders(status.getStatus(), -1);
376+
errorResponse.writeResponse(exchange);
297377
}
298378
}
299379
}
@@ -302,4 +382,22 @@ public HttpHandler getDelegate() {
302382
return delegate;
303383
}
304384
}
385+
386+
record S3ErrorResponse(RestStatus status, String responseBody) {
387+
388+
S3ErrorResponse(RestStatus status) {
389+
this(status, null);
390+
}
391+
392+
@SuppressForbidden(reason = "this test uses a HttpServer to emulate an S3 endpoint")
393+
public void writeResponse(HttpExchange exchange) throws IOException {
394+
if (responseBody != null) {
395+
byte[] responseBytes = responseBody.getBytes(StandardCharsets.UTF_8);
396+
exchange.sendResponseHeaders(status.getStatus(), responseBytes.length);
397+
exchange.getResponseBody().write(responseBytes);
398+
} else {
399+
exchange.sendResponseHeaders(status.getStatus(), -1);
400+
}
401+
}
402+
}
305403
}

0 commit comments

Comments
 (0)