From 3f96729f02687c55b85c502e12257450e3252d4b Mon Sep 17 00:00:00 2001 From: David Turner Date: Mon, 17 Feb 2025 08:51:30 +0000 Subject: [PATCH 1/3] Fork post-snapshot-delete cleanup off master thread We shouldn't run the post-snapshot-delete cleanup work on the master thread, since it can be quite expensive and need not block subsequent cluster state updates. This commit forks it onto a `SNAPSHOT` thread. --- docs/changelog/122047.yaml | 5 +++++ .../test/snapshot.delete/10_basic.yml | 7 +++++++ .../snapshots/RepositoriesIT.java | 6 ++++++ .../repositories/RepositoryData.java | 2 ++ .../blobstore/BlobStoreRepository.java | 20 ++++++++++++------- 5 files changed, 33 insertions(+), 7 deletions(-) create mode 100644 docs/changelog/122047.yaml diff --git a/docs/changelog/122047.yaml b/docs/changelog/122047.yaml new file mode 100644 index 0000000000000..8bd711e3b1b29 --- /dev/null +++ b/docs/changelog/122047.yaml @@ -0,0 +1,5 @@ +pr: 122047 +summary: Fork post-snapshot-delete cleanup off master thread +area: Snapshot/Restore +type: bug +issues: [] diff --git a/rest-api-spec/src/yamlRestTest/resources/rest-api-spec/test/snapshot.delete/10_basic.yml b/rest-api-spec/src/yamlRestTest/resources/rest-api-spec/test/snapshot.delete/10_basic.yml index 5a60f76f6da2c..84e0d5b3e524a 100644 --- a/rest-api-spec/src/yamlRestTest/resources/rest-api-spec/test/snapshot.delete/10_basic.yml +++ b/rest-api-spec/src/yamlRestTest/resources/rest-api-spec/test/snapshot.delete/10_basic.yml @@ -68,3 +68,10 @@ setup: wait_for_completion: false - match: { acknowledged: true } + + # now create another snapshot just to ensure that the async delete finishes before the test cleanup runs: + - do: + snapshot.create: + repository: test_repo_create_1 + snapshot: barrier_snapshot + wait_for_completion: true diff --git a/server/src/internalClusterTest/java/org/elasticsearch/snapshots/RepositoriesIT.java b/server/src/internalClusterTest/java/org/elasticsearch/snapshots/RepositoriesIT.java index c318ebf78dd96..562f752b82220 100644 --- a/server/src/internalClusterTest/java/org/elasticsearch/snapshots/RepositoriesIT.java +++ b/server/src/internalClusterTest/java/org/elasticsearch/snapshots/RepositoriesIT.java @@ -508,6 +508,12 @@ public void taskSucceeded(ClusterStateTaskListener clusterStateTaskListener, Obj .orElseThrow() .queue(); + // There is one task in the queue for computing and forking the cleanup work. + assertThat(queueLength.getAsInt(), equalTo(1)); + + safeAwait(barrier); // unblock the barrier thread and let it process the queue + safeAwait(barrier); // wait for the queue to be processed + // There are indexCount (=3*snapshotPoolSize) index-deletion tasks, plus one for cleaning up the root metadata. However, the // throttled runner only enqueues one task per SNAPSHOT thread to start with, and then the eager runner adds another one. This shows // we are not spamming the threadpool with all the tasks at once, which means that other snapshot activities can run alongside this diff --git a/server/src/main/java/org/elasticsearch/repositories/RepositoryData.java b/server/src/main/java/org/elasticsearch/repositories/RepositoryData.java index 2c429954f5f49..e89998dc919e6 100644 --- a/server/src/main/java/org/elasticsearch/repositories/RepositoryData.java +++ b/server/src/main/java/org/elasticsearch/repositories/RepositoryData.java @@ -29,6 +29,7 @@ import org.elasticsearch.snapshots.SnapshotInfo; import org.elasticsearch.snapshots.SnapshotState; import org.elasticsearch.snapshots.SnapshotsService; +import org.elasticsearch.threadpool.ThreadPool; import org.elasticsearch.xcontent.XContentBuilder; import org.elasticsearch.xcontent.XContentParser; @@ -377,6 +378,7 @@ private static boolean isIndexToUpdateAfterRemovingSnapshots( * @return map of index to index metadata blob id to delete */ public Map> indexMetaDataToRemoveAfterRemovingSnapshots(Collection snapshotIds) { + assert ThreadPool.assertCurrentThreadPool(ThreadPool.Names.SNAPSHOT); Iterator indicesForSnapshot = indicesToUpdateAfterRemovingSnapshot(snapshotIds); final Set allRemainingIdentifiers = indexMetaDataGenerations.lookup.entrySet() .stream() diff --git a/server/src/main/java/org/elasticsearch/repositories/blobstore/BlobStoreRepository.java b/server/src/main/java/org/elasticsearch/repositories/blobstore/BlobStoreRepository.java index 11386eba10196..e3ea37c6a4b20 100644 --- a/server/src/main/java/org/elasticsearch/repositories/blobstore/BlobStoreRepository.java +++ b/server/src/main/java/org/elasticsearch/repositories/blobstore/BlobStoreRepository.java @@ -1134,14 +1134,20 @@ private void runWithUniqueShardMetadataNaming(ActionListener rep ); }) - .andThen((l, newRepositoryData) -> { - l.onResponse(newRepositoryData); - // Once we have updated the repository, run the unreferenced blobs cleanup in parallel to shard-level snapshot deletion - try (var refs = new RefCountingRunnable(onCompletion)) { - cleanupUnlinkedRootAndIndicesBlobs(newRepositoryData, refs.acquireListener()); - cleanupUnlinkedShardLevelBlobs(refs.acquireListener()); + .andThen( + // writeIndexGen finishes on master-service thread so must fork here. + snapshotExecutor, + threadPool.getThreadContext(), + (l, newRepositoryData) -> { + l.onResponse(newRepositoryData); + // Once we have updated the repository, run the unreferenced blobs cleanup in parallel to shard-level snapshot + // deletion + try (var refs = new RefCountingRunnable(onCompletion)) { + cleanupUnlinkedRootAndIndicesBlobs(newRepositoryData, refs.acquireListener()); + cleanupUnlinkedShardLevelBlobs(refs.acquireListener()); + } } - }) + ) .addListener(repositoryDataUpdateListener); } From 3f7662ea3748224599fe5471ac0f3bc0b61ce604 Mon Sep 17 00:00:00 2001 From: David Turner Date: Mon, 17 Feb 2025 09:03:18 +0000 Subject: [PATCH 2/3] Update docs/changelog/122731.yaml --- docs/changelog/122731.yaml | 5 +++++ 1 file changed, 5 insertions(+) create mode 100644 docs/changelog/122731.yaml diff --git a/docs/changelog/122731.yaml b/docs/changelog/122731.yaml new file mode 100644 index 0000000000000..afde587a9538b --- /dev/null +++ b/docs/changelog/122731.yaml @@ -0,0 +1,5 @@ +pr: 122731 +summary: Fork post-snapshot-delete cleanup off master thread +area: Snapshot/Restore +type: bug +issues: [] From f79e3cf05f1513f84b91ed54181554c2cd7f6317 Mon Sep 17 00:00:00 2001 From: David Turner Date: Mon, 17 Feb 2025 10:38:47 +0000 Subject: [PATCH 3/3] Remove legacy changelog --- docs/changelog/122047.yaml | 5 ----- 1 file changed, 5 deletions(-) delete mode 100644 docs/changelog/122047.yaml diff --git a/docs/changelog/122047.yaml b/docs/changelog/122047.yaml deleted file mode 100644 index 8bd711e3b1b29..0000000000000 --- a/docs/changelog/122047.yaml +++ /dev/null @@ -1,5 +0,0 @@ -pr: 122047 -summary: Fork post-snapshot-delete cleanup off master thread -area: Snapshot/Restore -type: bug -issues: []