From 904099fa77e669a6bfc2d0054054703e58b35e79 Mon Sep 17 00:00:00 2001 From: David Turner Date: Wed, 21 May 2025 18:18:36 +0100 Subject: [PATCH 1/2] Improve get-snapshots behaviour for unreadable repositories Today if the get-snapshots API cannot access one of the repositories we return an exception with a fairly low-level message about the problem, perhaps `Could not determine repository generation from root blobs`. This message is shown verbatim in the Kibana UI so users need something a little more descriptive. With this commit we wrap the exception in one that indicates the problem in terms that users are more likely to understand. Moreover, if the user specifies the `?ignore_unavailable` option then we ignore individual snapshots that are unavailable, treating them as if they do not exist, but an unavailable repository will still cause the API to return an exception. With this commit we extend the meaning of this option to also ignore whole-repository unavailability, treating unavailable repositories as if they are empty. Relates #128208 --- .../snapshots/GetSnapshotsIT.java | 66 +++++++++++++++++++ .../get/TransportGetSnapshotsAction.java | 12 +++- 2 files changed, 77 insertions(+), 1 deletion(-) diff --git a/server/src/internalClusterTest/java/org/elasticsearch/snapshots/GetSnapshotsIT.java b/server/src/internalClusterTest/java/org/elasticsearch/snapshots/GetSnapshotsIT.java index c0e63520fee9e..2068997aefd7b 100644 --- a/server/src/internalClusterTest/java/org/elasticsearch/snapshots/GetSnapshotsIT.java +++ b/server/src/internalClusterTest/java/org/elasticsearch/snapshots/GetSnapshotsIT.java @@ -9,6 +9,7 @@ package org.elasticsearch.snapshots; +import org.apache.logging.log4j.Level; import org.apache.lucene.util.BytesRef; import org.elasticsearch.action.ActionFuture; import org.elasticsearch.action.ActionListener; @@ -40,10 +41,13 @@ import org.elasticsearch.core.Predicates; import org.elasticsearch.repositories.RepositoriesService; import org.elasticsearch.repositories.RepositoryData; +import org.elasticsearch.repositories.RepositoryException; import org.elasticsearch.repositories.RepositoryMissingException; +import org.elasticsearch.repositories.blobstore.BlobStoreRepository; import org.elasticsearch.repositories.fs.FsRepository; import org.elasticsearch.search.sort.SortOrder; import org.elasticsearch.test.ESTestCase; +import org.elasticsearch.test.MockLog; import org.elasticsearch.test.XContentTestUtils; import org.elasticsearch.test.hamcrest.ElasticsearchAssertions; import org.elasticsearch.threadpool.ThreadPool; @@ -633,6 +637,68 @@ public void testRetrievingSnapshotsWhenRepositoryIsMissing() throws Exception { expectThrows(RepositoryMissingException.class, multiRepoFuture::actionGet); } + public void testRetrievingSnapshotsWhenRepositoryIsUnreadable() throws Exception { + final String repoName = randomIdentifier(); + final Path repoPath = randomRepoPath(); + createRepository( + repoName, + "fs", + Settings.builder().put("location", repoPath).put(BlobStoreRepository.CACHE_REPOSITORY_DATA.getKey(), false) + ); + createNSnapshots(repoName, randomIntBetween(1, 3)); + + try { + try (var directoryStream = Files.newDirectoryStream(repoPath)) { + for (final var directoryEntry : directoryStream) { + if (Files.isRegularFile(directoryEntry) && directoryEntry.getFileName().toString().startsWith("index-")) { + Files.writeString(directoryEntry, "invalid"); + } + } + } + + final var repositoryException = safeAwaitAndUnwrapFailure( + RepositoryException.class, + GetSnapshotsResponse.class, + l -> clusterAdmin().prepareGetSnapshots(TEST_REQUEST_TIMEOUT, repoName) + .setSort(SnapshotSortKey.NAME) + .setIgnoreUnavailable(false) + .execute(l) + ); + assertEquals( + Strings.format("[%s] cannot retrieve snapshots list from this repository", repoName), + repositoryException.getMessage() + ); + assertEquals( + Strings.format("[%s] Unexpected exception when loading repository data", repoName), + repositoryException.getCause().getMessage() + ); + + MockLog.assertThatLogger( + () -> safeAwait( + l -> clusterAdmin().prepareGetSnapshots(TEST_REQUEST_TIMEOUT, repoName) + .setSort(SnapshotSortKey.NAME) + .setIgnoreUnavailable(true) + .execute(l.map(response -> { + assertThat(response.getSnapshots(), empty()); + return null; + })) + ), + TransportGetSnapshotsAction.class, + new MockLog.SeenEventExpectation( + "invalid repository warning", + TransportGetSnapshotsAction.class.getCanonicalName(), + Level.WARN, + Strings.format("failed to fetch repository data for [%s]", repoName) + ) + ); + + } finally { + safeAwait( + l -> clusterAdmin().prepareDeleteRepository(TEST_REQUEST_TIMEOUT, TEST_REQUEST_TIMEOUT, repoName).execute(l.map(v -> null)) + ); + } + } + // Create a snapshot that is guaranteed to have a unique start time and duration for tests around ordering by either. // Don't use this with more than 3 snapshots on platforms with low-resolution clocks as the durations could always collide there // causing an infinite loop diff --git a/server/src/main/java/org/elasticsearch/action/admin/cluster/snapshots/get/TransportGetSnapshotsAction.java b/server/src/main/java/org/elasticsearch/action/admin/cluster/snapshots/get/TransportGetSnapshotsAction.java index 3e0411930191a..b50cba1310c42 100644 --- a/server/src/main/java/org/elasticsearch/action/admin/cluster/snapshots/get/TransportGetSnapshotsAction.java +++ b/server/src/main/java/org/elasticsearch/action/admin/cluster/snapshots/get/TransportGetSnapshotsAction.java @@ -38,6 +38,7 @@ import org.elasticsearch.repositories.RepositoriesService; import org.elasticsearch.repositories.Repository; import org.elasticsearch.repositories.RepositoryData; +import org.elasticsearch.repositories.RepositoryException; import org.elasticsearch.repositories.RepositoryMissingException; import org.elasticsearch.repositories.ResolvedRepositories; import org.elasticsearch.search.sort.SortOrder; @@ -285,7 +286,16 @@ private void populateResults(ActionListener listener) { ), repositoryName -> asyncRepositoryContentsListener -> SubscribableListener - .newForked(l -> maybeGetRepositoryData(repositoryName, l)) + .newForked(l -> maybeGetRepositoryData(repositoryName, l.delegateResponse((ll, e) -> { + if (ignoreUnavailable) { + logger.warn(Strings.format("failed to fetch repository data for [%s]", repositoryName), e); + ll.onResponse(RepositoryData.EMPTY); + } else { + ll.onFailure( + new RepositoryException(repositoryName, "cannot retrieve snapshots list from this repository", e) + ); + } + }))) .andThenApply(repositoryData -> { assert ThreadPool.assertCurrentThreadPool(ThreadPool.Names.MANAGEMENT); cancellableTask.ensureNotCancelled(); From 279f96156c4e543801df8588dcc83a2c1ec48dd4 Mon Sep 17 00:00:00 2001 From: David Turner Date: Wed, 21 May 2025 22:05:56 +0100 Subject: [PATCH 2/2] Update docs/changelog/128277.yaml --- docs/changelog/128277.yaml | 5 +++++ 1 file changed, 5 insertions(+) create mode 100644 docs/changelog/128277.yaml diff --git a/docs/changelog/128277.yaml b/docs/changelog/128277.yaml new file mode 100644 index 0000000000000..6f66c4d613029 --- /dev/null +++ b/docs/changelog/128277.yaml @@ -0,0 +1,5 @@ +pr: 128277 +summary: Improve get-snapshots behaviour for unreadable repositories +area: Snapshot/Restore +type: enhancement +issues: []