From 4d3c22b558ef0ac3ae25eab237b57717cc6d4c5b Mon Sep 17 00:00:00 2001 From: David Turner Date: Fri, 7 Feb 2025 15:24:46 +0000 Subject: [PATCH 1/3] Fork post-snapshot-delete cleanup off master thread We shouldn't run the post-snapshot-delete cleanup work on the master thread, since it can be quite expensive and need not block subsequent cluster state updates. This commit forks it onto a `SNAPSHOT` thread. --- .../repositories/RepositoryData.java | 2 ++ .../blobstore/BlobStoreRepository.java | 20 ++++++++++++------- 2 files changed, 15 insertions(+), 7 deletions(-) diff --git a/server/src/main/java/org/elasticsearch/repositories/RepositoryData.java b/server/src/main/java/org/elasticsearch/repositories/RepositoryData.java index 2c429954f5f49..e89998dc919e6 100644 --- a/server/src/main/java/org/elasticsearch/repositories/RepositoryData.java +++ b/server/src/main/java/org/elasticsearch/repositories/RepositoryData.java @@ -29,6 +29,7 @@ import org.elasticsearch.snapshots.SnapshotInfo; import org.elasticsearch.snapshots.SnapshotState; import org.elasticsearch.snapshots.SnapshotsService; +import org.elasticsearch.threadpool.ThreadPool; import org.elasticsearch.xcontent.XContentBuilder; import org.elasticsearch.xcontent.XContentParser; @@ -377,6 +378,7 @@ private static boolean isIndexToUpdateAfterRemovingSnapshots( * @return map of index to index metadata blob id to delete */ public Map> indexMetaDataToRemoveAfterRemovingSnapshots(Collection snapshotIds) { + assert ThreadPool.assertCurrentThreadPool(ThreadPool.Names.SNAPSHOT); Iterator indicesForSnapshot = indicesToUpdateAfterRemovingSnapshot(snapshotIds); final Set allRemainingIdentifiers = indexMetaDataGenerations.lookup.entrySet() .stream() diff --git a/server/src/main/java/org/elasticsearch/repositories/blobstore/BlobStoreRepository.java b/server/src/main/java/org/elasticsearch/repositories/blobstore/BlobStoreRepository.java index 11386eba10196..e3ea37c6a4b20 100644 --- a/server/src/main/java/org/elasticsearch/repositories/blobstore/BlobStoreRepository.java +++ b/server/src/main/java/org/elasticsearch/repositories/blobstore/BlobStoreRepository.java @@ -1134,14 +1134,20 @@ private void runWithUniqueShardMetadataNaming(ActionListener rep ); }) - .andThen((l, newRepositoryData) -> { - l.onResponse(newRepositoryData); - // Once we have updated the repository, run the unreferenced blobs cleanup in parallel to shard-level snapshot deletion - try (var refs = new RefCountingRunnable(onCompletion)) { - cleanupUnlinkedRootAndIndicesBlobs(newRepositoryData, refs.acquireListener()); - cleanupUnlinkedShardLevelBlobs(refs.acquireListener()); + .andThen( + // writeIndexGen finishes on master-service thread so must fork here. + snapshotExecutor, + threadPool.getThreadContext(), + (l, newRepositoryData) -> { + l.onResponse(newRepositoryData); + // Once we have updated the repository, run the unreferenced blobs cleanup in parallel to shard-level snapshot + // deletion + try (var refs = new RefCountingRunnable(onCompletion)) { + cleanupUnlinkedRootAndIndicesBlobs(newRepositoryData, refs.acquireListener()); + cleanupUnlinkedShardLevelBlobs(refs.acquireListener()); + } } - }) + ) .addListener(repositoryDataUpdateListener); } From f2942d5eb0a0d8acc47e421322a0411ffc2784cc Mon Sep 17 00:00:00 2001 From: David Turner Date: Fri, 7 Feb 2025 15:26:59 +0000 Subject: [PATCH 2/3] Update docs/changelog/122047.yaml --- docs/changelog/122047.yaml | 5 +++++ 1 file changed, 5 insertions(+) create mode 100644 docs/changelog/122047.yaml diff --git a/docs/changelog/122047.yaml b/docs/changelog/122047.yaml new file mode 100644 index 0000000000000..8bd711e3b1b29 --- /dev/null +++ b/docs/changelog/122047.yaml @@ -0,0 +1,5 @@ +pr: 122047 +summary: Fork post-snapshot-delete cleanup off master thread +area: Snapshot/Restore +type: bug +issues: [] From 2e61cf984c89eea0ae39eef82f67851a10f35642 Mon Sep 17 00:00:00 2001 From: David Turner Date: Fri, 7 Feb 2025 16:58:52 +0000 Subject: [PATCH 3/3] Handle new queueing behaviour --- .../java/org/elasticsearch/snapshots/RepositoriesIT.java | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/server/src/internalClusterTest/java/org/elasticsearch/snapshots/RepositoriesIT.java b/server/src/internalClusterTest/java/org/elasticsearch/snapshots/RepositoriesIT.java index c318ebf78dd96..562f752b82220 100644 --- a/server/src/internalClusterTest/java/org/elasticsearch/snapshots/RepositoriesIT.java +++ b/server/src/internalClusterTest/java/org/elasticsearch/snapshots/RepositoriesIT.java @@ -508,6 +508,12 @@ public void taskSucceeded(ClusterStateTaskListener clusterStateTaskListener, Obj .orElseThrow() .queue(); + // There is one task in the queue for computing and forking the cleanup work. + assertThat(queueLength.getAsInt(), equalTo(1)); + + safeAwait(barrier); // unblock the barrier thread and let it process the queue + safeAwait(barrier); // wait for the queue to be processed + // There are indexCount (=3*snapshotPoolSize) index-deletion tasks, plus one for cleaning up the root metadata. However, the // throttled runner only enqueues one task per SNAPSHOT thread to start with, and then the eager runner adds another one. This shows // we are not spamming the threadpool with all the tasks at once, which means that other snapshot activities can run alongside this