Limit number of shard snapshot in INIT state per node #131592 (closed)

Commits (61)
- 447de70 (ywangd): Add assigned queued and limit INIT per node
- a74f94b (ywangd): renames and tweak
- f76b13a (ywangd): Fix deletions
- 55a6907 (ywangd): Extract common method
- 8eb71ef (ywangd): Fix allQueued snapshot test for deletion
- 82765b2 (ywangd): Change back to assigned queued
- 4292017 (ywangd): improve node counting a bit
- 12dfdc3 (ywangd): Allow relocation and add an IT
- cd574b8 (ywangd): Update docs/changelog/131592.yaml
- 3a76132 (ywangd): rename
- 49e1b42 (ywangd): Fix a bug where AssignedQueued shard fails with MISSING
- 6c9fdaa (ywangd): Merge remote-tracking branch 'origin/main' into shard-snapshot-limit-…
- 5ecc2cf (ywangd): Fix bug where assigned-queued can be left forever
- ae42b23 (ywangd): comments
- fcfbc63 (ywangd): actually use updated states
- 329a2c5 (ywangd): TODO about removing the kickoff logic after deletion removal
- f60009a (ywangd): Merge remote-tracking branch 'origin/main' into shard-snapshot-limit-…
- 8396c67 (ywangd): Merge remote-tracking branch 'origin/main' into shard-snapshot-limit-…
- c6d86c9 (ywangd): Revert "TODO about removing the kickoff logic after deletion removal"
- 726da41 (ywangd): remove kickoff process after deletion removal
- 0fde650 (ywangd): Fix a bug where assigned-queued shards are not kicked off
- c76f244 (ywangd): One more TODO
- d6d0ac4 (ywangd): comments
- d20856e (ywangd): Simply how we kick off assigned queued shards for entries not seeing …
- b1de4c6 (ywangd): Merge remote-tracking branch 'origin/main' into shard-snapshot-limit-…
- 57ac6aa (ywangd): half the per-node-limit in stress IT
- 9ec50c1 (ywangd): Honor running shard-snapshot counting in processWaitingShardsAndRemov…
- 5bd6ca2 (ywangd): enable rebalance
- b5c4907 (ywangd): Add random allocation filtering
- 713d86a (ywangd): rename
- 8a95b99 (ywangd): Randomize and update per-node limit in stress IT
- 292649d (ywangd): Actually randomize initial shard snapshot limit
- 4109412 (ywangd): Pre-compute hasAssignedQueuedShards
- f5d3b64 (ywangd): No need for static method
- 4768d27 (ywangd): Tweak and add tests for PerNodeShardSnapshotCounter
- c3c56d2 (ywangd): more test assertions
- 6745235 (ywangd): Merge remote-tracking branch 'origin/main' into shard-snapshot-limit-…
- 287aca2 (ywangd): update changelog
- 55d4502 (ywangd): bwc default
- 6277c7c (ywangd): Merge remote-tracking branch 'origin/main' into shard-snapshot-limit-…
- dfb035f (ywangd): augument tests for PerNodeShardSnapshotCounter
- e2ef7e4 (ywangd): Add test for completion order
- 095010f (ywangd): Merge remote-tracking branch 'origin/main' into shard-snapshot-limit-…
- 1337704 (ywangd): sometimes neither shutdown nor allocation filter
- b9e4925 (ywangd): reinstate class qualification
- fe7be7a (ywangd): Use null nodeId to different aborted assigned-queued
- 99c81e7 (ywangd): fix test
- 1b90ef9 (ywangd): fix test
- dab105d (ywangd): Merge remote-tracking branch 'origin/main' into shard-snapshot-limit-…
- 6996653 (ywangd): Merge remote-tracking branch 'origin/main' into shard-snapshot-limit-…
- 0b64258 (ywangd): Complete assigned-queued shard snapshots inline without one more upda…
- 8bef3f0 (elasticmachine): Merge branch 'main' into shard-snapshot-limit-init-poc
- a29ba9b (ywangd): Fix propagation for clones
- 1d7f484 (ywangd): Revert "Fix propagation for clones"
- 5d8c51e (ywangd): Revert "Complete assigned-queued shard snapshots inline without one m…
- cf96d35 (ywangd): Refactor and reuse EntryContext for propagation
- 0198d7b (ywangd): fail assigned-queued shards directly in processExternalChanges
- 0b9c6f2 (ywangd): Comments, renames, assertions and deleted status update
- 68f8663 (ywangd): Fix deletion where it may need to start a previous deletion
- 65888c5 (DaveCTurner): Merge branch 'main' into shard-snapshot-limit-init-poc
- 096a9df: [CI] Update transport version definitions
docs/changelog/131592.yaml (new file, 5 additions, 0 deletions):

```yaml
pr: 131592
summary: "[PoC] Limited number of shard snapshot in INIT state per node"
area: Snapshot/Restore
type: enhancement
issues: []
```
server/src/internalClusterTest/java/org/elasticsearch/snapshots/SnapshotAndRelocationIT.java (new file, 129 additions, 0 deletions):
```java
/*
 * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
 * or more contributor license agreements. Licensed under the "Elastic License
 * 2.0", the "GNU Affero General Public License v3.0 only", and the "Server Side
 * Public License v 1"; you may not use this file except in compliance with, at
 * your election, the "Elastic License 2.0", the "GNU Affero General Public
 * License v3.0 only", or the "Server Side Public License, v 1".
 */
package org.elasticsearch.snapshots;

import org.elasticsearch.cluster.SnapshotsInProgress;
import org.elasticsearch.cluster.service.ClusterService;
import org.elasticsearch.common.settings.Settings;
import org.elasticsearch.core.CheckedRunnable;
import org.elasticsearch.index.shard.ShardId;
import org.elasticsearch.indices.IndicesService;
import org.elasticsearch.plugins.Plugin;
import org.elasticsearch.snapshots.mockstore.MockRepository;
import org.elasticsearch.test.ESIntegTestCase;
import org.elasticsearch.test.transport.MockTransportService;

import java.util.Arrays;
import java.util.Collection;
import java.util.Map;
import java.util.concurrent.CountDownLatch;
import java.util.concurrent.atomic.AtomicBoolean;
import java.util.concurrent.atomic.AtomicReference;
import java.util.stream.IntStream;
import java.util.stream.StreamSupport;

import static org.hamcrest.Matchers.equalTo;
import static org.hamcrest.Matchers.is;

@ESIntegTestCase.ClusterScope(scope = ESIntegTestCase.Scope.TEST, numDataNodes = 0)
public class SnapshotAndRelocationIT extends AbstractSnapshotIntegTestCase {

    @Override
    protected Collection<Class<? extends Plugin>> nodePlugins() {
        return Arrays.asList(MockTransportService.TestPlugin.class, MockRepository.Plugin.class);
    }

    @Override
    protected Settings nodeSettings(int nodeOrdinal, Settings otherSettings) {
        return Settings.builder()
            .put(super.nodeSettings(nodeOrdinal, otherSettings))
            .put(SnapshotsService.SHARD_SNAPSHOT_PER_NODE_LIMIT_SETTING.getKey(), 1)
            .build();
    }

    public void testLimitingInitAndRelocationForAssignedQueueShards() throws Exception {
        final String masterNode = internalCluster().startMasterOnlyNode();
        final String dataNodeA = internalCluster().startDataOnlyNode();
        final String dataNodeB = internalCluster().startDataOnlyNode();
        final String repoName = "test-repo";
        createRepository(repoName, "mock");

        // Delay the first shard-snapshot status update so the snapshot cannot
        // complete until we explicitly release it below.
        final AtomicBoolean delayOnce = new AtomicBoolean(false);
        final AtomicReference<CheckedRunnable<Exception>> delayedAction = new AtomicReference<>();
        final var delayedActionSetLatch = new CountDownLatch(1);
        MockTransportService.getInstance(masterNode)
            .addRequestHandlingBehavior(SnapshotsService.UPDATE_SNAPSHOT_STATUS_ACTION_NAME, (handler, request, channel, task) -> {
                if (delayOnce.compareAndSet(false, true)) {
                    delayedAction.set(() -> handler.messageReceived(request, channel, task));
                    delayedActionSetLatch.countDown();
                } else {
                    handler.messageReceived(request, channel, task);
                }
            });

        final var numIndices = between(2, 4);
        final var indexNames = IntStream.range(0, numIndices).mapToObj(i -> "index-" + i).toList();

        // All indices start out pinned to dataNodeA
        for (var indexName : indexNames) {
            createIndex(indexName, indexSettings(1, 0).put("index.routing.allocation.include._name", dataNodeA).build());
            indexRandomDocs(indexName, between(10, 42));
        }
        ensureGreen();

        final var future = startFullSnapshot(repoName, "snapshot");
        safeAwait(delayedActionSetLatch);

        final var clusterService = internalCluster().getCurrentMasterNodeInstance(ClusterService.class);
        final SnapshotsInProgress.Entry snapshot = SnapshotsInProgress.get(clusterService.state()).asStream().iterator().next();
        logger.info("--> snapshot=[{}]", snapshot);
        final var shards = snapshot.shards();
        assertThat(shards.size(), equalTo(numIndices));

        // With a per-node limit of 1, exactly one shard is in INIT on dataNodeA ...
        final var dataNodeAId = getNodeId(dataNodeA);
        final var initShards = shards.entrySet()
            .stream()
            .filter(entry -> entry.getValue().state() == SnapshotsInProgress.ShardState.INIT)
            .peek(entry -> assertThat(entry.getValue().nodeId(), equalTo(dataNodeAId)))
            .map(Map.Entry::getKey)
            .toList();
        logger.info("--> init shards [{}]", initShards);
        assertThat(initShards.size(), equalTo(1));

        // ... and all remaining shards are assigned-queued on the same node
        final var assignedQueuedShards = shards.entrySet()
            .stream()
            .filter(entry -> entry.getValue().isAssignedQueued())
            .peek(entry -> assertThat(entry.getValue().nodeId(), equalTo(dataNodeAId)))
            .map(Map.Entry::getKey)
            .toList();
        logger.info("--> assigned queued shards [{}]", assignedQueuedShards);
        assertThat(assignedQueuedShards.size(), equalTo(numIndices - 1));

        // Relocate indices that are assigned queued
        final String[] indices = assignedQueuedShards.stream().map(ShardId::getIndexName).toArray(String[]::new);
        logger.info("--> relocate indices [{}]", Arrays.toString(indices));
        updateIndexSettings(Settings.builder().put("index.routing.allocation.include._name", dataNodeB), indices);
        ensureGreen(indices);

        final var dataNodeBIndicesService = internalCluster().getInstance(IndicesService.class, dataNodeB);
        for (var shardId : assignedQueuedShards) {
            assertTrue(
                "indices: "
                    + StreamSupport.stream(dataNodeBIndicesService.spliterator(), false)
                        .map(indexService -> indexService.index().getName())
                        .toList(),
                dataNodeBIndicesService.hasIndex(shardId.getIndex())
            );
        }

        assertThat(future.isDone(), is(false));
        logger.info("--> run delayed action");
        delayedAction.get().run();
        assertSuccessful(future);
    }
}
```
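The integration test exercises a per-node cap on the number of shard snapshots in the INIT state. As a hedged sketch of the gating idea only (the PR's actual PerNodeShardSnapshotCounter is more involved; the class and method names below are hypothetical, not Elasticsearch APIs):

```java
import java.util.HashMap;
import java.util.Map;

// Hypothetical sketch: caps how many shard snapshots may be in INIT on any
// single data node; shards over the cap would stay assigned-queued.
class PerNodeInitLimiter {
    private final int perNodeLimit;
    private final Map<String, Integer> initCounts = new HashMap<>();

    PerNodeInitLimiter(int perNodeLimit) {
        this.perNodeLimit = perNodeLimit;
    }

    // Returns true if a shard may move to INIT on the given node, taking a
    // slot; false means the shard must wait in the queue.
    boolean tryStart(String nodeId) {
        int current = initCounts.getOrDefault(nodeId, 0);
        if (current >= perNodeLimit) {
            return false;
        }
        initCounts.put(nodeId, current + 1);
        return true;
    }

    // A shard snapshot on the node finished (or was aborted): free one slot.
    void complete(String nodeId) {
        initCounts.merge(nodeId, -1, Integer::sum);
    }
}
```

With a limit of 1, as in the test's node settings, a second tryStart on the same node fails until the first shard completes, which matches the observed one-INIT-shard, rest-assigned-queued split.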
Is the distinction between assigned-queued and unassigned-queued important? My thinking is that the point of leaving the shards in state QUEUED while a limit is in play is that we haven't committed to running them on a particular node yet: we only need to make that decision when we move them to INIT.
Yes, the distinction is important. My general principle is to have assigned-queued work similarly to the INIT state as much as possible. I think this helps leverage existing code logic the most.
- […] INIT, so that the new shard snapshot will be in QUEUED in this state.
- […] QUEUED state. This is done by sending the update to SnapshotTaskExecutor, which performs the state propagation. Again, a similar approach to updating the INIT state.
- […] INIT ones.
Hmm, I really don't like moving from QUEUED to ABORTED. ABORTED really means "aborting": it indicates there's work in progress on the data node and we must wait for the data node to complete it and update the master state. But if the shard was QUEUED then we aren't waiting for the data node any more.

I see two alternatives:

1. Extract out the common code that moves future snapshots of a shard to a ready (or at least assigned-queued) state, so we can re-use it on both paths.
2. Drop the assigned-queued state and instead move shards from unassigned-queued to INIT in the SnapshotTaskExecutor as other shard snapshots complete.

What do you think?
With this PR, we are giving new meanings to existing states. QUEUED can now be assigned and, similarly, ABORTED can now be unassigned, i.e. a null nodeId indicates no work left on the data node and the master will complete it with another update cycle. These new meanings seem fine to me. We are adding more state transitions after all.

On the first alternative: I think the logic gets more complicated with this approach. We are basically duplicating part of SnapshotShardsUpdateContext#EntryContext#computeUpdatedSnapshotEntryFromShardUpdates and its downstream methods. It's very hard to extract reusable code from it; we would have to duplicate. It's possible, but I'd prefer we keep the state updates in one place, as suggested by the existing comment there.

On the second alternative: not sure I follow. Do you mean completely removing the new assigned-queued state? I suspect the change would be a lot more complicated without it. We still need to differentiate one QUEUED shard from the others. If we don't have the new state, we will have to rely on checking (1) whether the shard is the first queued and (2) whether there is an INIT shard before it. This is rather cumbersome, especially when we update snapshots with shard updates in a loop. Deletion also needs to tell the difference. Currently, QUEUED does not stop a deletion from starting, because there must be an INIT (or some other active state) shard that prevents the deletion from starting. This PR prevents deletion with the assigned-queued state; without it, deletion would also need to check whether one of the QUEUED shards is special. Overall, I think the new state makes the new logic more explicit. It is also likely more performant, since we can quickly tell whether a snapshot has any shards queued due to the capacity limit. So I think I prefer it.
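The repurposed state meanings described in this thread (a QUEUED shard that already has a node chosen, an ABORTED shard with a null nodeId) can be sketched as a tiny model. This is illustrative only and is not the real SnapshotsInProgress.ShardSnapshotStatus API:

```java
// Illustrative model only; the real class is SnapshotsInProgress.ShardSnapshotStatus.
record ShardStatus(ShardStatus.State state, String nodeId) {
    enum State { QUEUED, INIT, ABORTED, SUCCESS, FAILED }

    // QUEUED with a node already chosen: waiting only for a free per-node slot.
    boolean isAssignedQueued() {
        return state == State.QUEUED && nodeId != null;
    }

    // ABORTED with null nodeId: no work left on any data node;
    // the master completes it in a later update cycle.
    boolean isUnassignedAborted() {
        return state == State.ABORTED && nodeId == null;
    }
}
```

The point of the sketch is that no new enum constant is needed: the (state, nodeId) pair alone distinguishes the new transitions.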
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
To be clear, if we must choose one of the alternatives, I prefer the first one. It adds some complexity but comes with a potential performance benefit, i.e. no need for another shard update cycle.
I have now pushed 0b64258 to implement the first alternative. Please let me know whether it looks better. Thanks!
The new change does not work because it does not handle clones. It's been a while and I had forgotten that this was part of the complexity that led me to take the original approach. I am trying to extend the change to cover clones, but it's getting quite ugly and I am not sure whether it is worthwhile.
OK, I updated to a better implementation of the first alternative (cf96d35). It refactors and reuses EntryContext for propagating state changes of assigned-queued shards to later snapshots, i.e. it uses the same logic as ShardSnapshotUpdate tasks while avoiding extra cluster state update cycles. This is now ready for another look. Thanks!
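As a loose illustration of the propagation idea discussed in this thread, i.e. promoting the next assigned-queued shard on a node once a running shard snapshot there completes, here is a sketch with entirely hypothetical names (the real logic lives in the cluster-state update path, not a standalone class):

```java
import java.util.ArrayDeque;
import java.util.Deque;
import java.util.HashMap;
import java.util.Map;

// Hypothetical sketch: per-node FIFO of assigned-queued shards, promoted
// one at a time as INIT slots free up on that node.
class AssignedQueuedPromoter {
    private final Map<String, Deque<String>> queuedByNode = new HashMap<>();

    // Record a shard as assigned-queued on the given node.
    void enqueue(String nodeId, String shardId) {
        queuedByNode.computeIfAbsent(nodeId, k -> new ArrayDeque<>()).addLast(shardId);
    }

    // Called when a shard snapshot finishes on nodeId; returns the next shard
    // to move to INIT, or null if nothing is waiting on that node.
    String promoteNext(String nodeId) {
        Deque<String> queue = queuedByNode.get(nodeId);
        return queue == null ? null : queue.pollFirst();
    }
}
```

In the PR itself this promotion happens inside the same cluster-state update that processes the completing ShardSnapshotUpdate, which is what avoids the extra update cycle mentioned above.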
EntryContextfor propagating state changes of assigned-queued shards to later snapshots, i.e. it uses the same logic asShardSnapshotUpdatetasks while avoiding extra cluster state update cycles. This is now ready for another look. Thanks!