Skip to content

Commit 8c5c96a

Browse files
authored
[9.2] Fix PAUSED_FOR_NODE_REMOVAL shard blocking QUEUED promotion (elastic#142637) (elastic#142673)
* Fix PAUSED_FOR_NODE_REMOVAL shard blocking QUEUED promotion (elastic#142637) When a snapshot with a PAUSED_FOR_NODE_REMOVAL shard is deleted, the abort previously transitioned it directly to FAILED. This bypassed the normal state propagation that promotes QUEUED shards, allowing a subsequently created snapshot to incorrectly receive INIT instead of QUEUED for the same shard, violating the ordering invariant. Change abort to transition PAUSED_FOR_NODE_REMOVAL to ABORTED (an active state) so that new snapshots correctly get QUEUED. The data node detects the PAUSED local status on an ABORTED entry and reports FAILED to the master, which triggers QUEUED promotion through the existing state propagation. (cherry picked from commit 3393f3a) * fix imports
1 parent 2a35c86 commit 8c5c96a

File tree

4 files changed

+95
-3
lines changed

4 files changed

+95
-3
lines changed

server/src/internalClusterTest/java/org/elasticsearch/snapshots/ConcurrentSnapshotsIT.java

Lines changed: 72 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,8 @@
2626
import org.elasticsearch.cluster.SnapshotDeletionsInProgress;
2727
import org.elasticsearch.cluster.SnapshotsInProgress;
2828
import org.elasticsearch.cluster.metadata.MetadataDeleteIndexService;
29+
import org.elasticsearch.cluster.metadata.ProjectId;
30+
import org.elasticsearch.cluster.service.ClusterService;
2931
import org.elasticsearch.common.Strings;
3032
import org.elasticsearch.common.settings.Settings;
3133
import org.elasticsearch.common.util.concurrent.ListenableFuture;
@@ -2288,6 +2290,76 @@ private ActionFuture<CreateSnapshotResponse> startFullSnapshotFromMasterClient(S
22882290
.execute();
22892291
}
22902292

2293+
/**
 * This test ensures that deleting a snapshot with paused shards works fine when there are shards queued behind it
 * as well as creating new snapshots concurrently. The snapshot state machine should propagate correctly.
 *
 * Scenario: snap0 blocks master finalization, snap1 gets a shard into PAUSED_FOR_NODE_REMOVAL, snap2 queues behind
 * it, then snap1 is deleted (deletion stays WAITING behind snap0). snap3 must still receive QUEUED — the regression
 * being guarded against is a new snapshot incorrectly getting INIT for the shard of a deleted-while-paused snapshot.
 */
public void testConcurrentDeletePausedSnapshotAndCreateSnapshot() throws Exception {
    // One dedicated master plus one data node; the data node is later marked for removal to trigger pausing.
    final String masterNode = internalCluster().startMasterOnlyNode();
    final String dataNode = internalCluster().startDataOnlyNode();
    ensureStableCluster(2);
    final String repoName = randomRepoName();
    // "mock" repository supports the block/unblock hooks used below.
    createRepository(repoName, "mock");

    // Single primary, no replicas: exactly one shard snapshot per snapshot, so state transitions are deterministic.
    final String indexName = randomIndexName();
    createIndexWithContent(indexName, indexSettingsNoReplicas(1).build());

    final var snap0 = randomSnapshotName();
    final var snap1 = randomSnapshotName();
    final var snap2 = randomSnapshotName();
    final var snap3 = randomSnapshotName();

    // snap0: block finalization on the master so that later deletion stays WAITING
    blockMasterOnWriteIndexFile(repoName);
    final var snap0Future = startFullSnapshot(repoName, snap0);
    waitForBlock(masterNode, repoName);

    // Add data so that snap1's shard snapshot has new segment files to write (allowing data node block)
    indexDoc(indexName, "another_id", "foo", "bar");

    // snap1: block on data node, then pause for node removal
    final var snap1Future = startFullSnapshotBlockedOnDataNode(snap1, repoName, dataNode);

    // Shutdown and unblock the data node so that the shard snapshot moves to PAUSED_FOR_NODE_REMOVAL
    final var clusterService = internalCluster().getCurrentMasterNodeInstance(ClusterService.class);
    putShutdownForRemovalMetadata(dataNode, clusterService);
    unblockNode(repoName, dataNode);
    // Wait until snap1's shard is actually observed in PAUSED_FOR_NODE_REMOVAL before proceeding.
    awaitClusterState(state -> {
        for (var entry : SnapshotsInProgress.get(state).forRepo(ProjectId.DEFAULT, repoName)) {
            if (entry.snapshot().getSnapshotId().getName().equals(snap1)) {
                for (var shard : entry.shards().values()) {
                    if (shard.state() == SnapshotsInProgress.ShardState.PAUSED_FOR_NODE_REMOVAL) {
                        return true;
                    }
                }
            }
        }
        return false;
    });

    // snap2: shard is QUEUED behind snap1's active PAUSED_FOR_NODE_REMOVAL shard
    final var snap2Future = startFullSnapshot(repoName, snap2);
    awaitNumberOfSnapshotsInProgress(3);

    // Delete snap1: PAUSED_FOR_NODE_REMOVAL → ABORTED in abort(), deletion is WAITING because snap0 is still finalizing
    final ActionFuture<AcknowledgedResponse> deleteFuture = startDeleteSnapshot(repoName, snap1);
    awaitNDeletionsInProgress(1);

    // snap3: start yet another snapshot and its shard correctly gets QUEUED
    final var snap3Future = startFullSnapshot(repoName, snap3);

    // Clean up: clear shutdown so shard snapshots can run, then unblock finalization
    clearShutdownMetadata(clusterService);
    unblockNode(repoName, masterNode);

    // All snapshots except the deleted one should complete successfully
    assertSuccessful(snap0Future);
    // The aborted-while-paused snapshot reports its shard as FAILED, so the snapshot itself ends up FAILED.
    assertThat(snap1Future.get().getSnapshotInfo().state(), is(SnapshotState.FAILED));
    assertThat(deleteFuture.get().isAcknowledged(), is(true));
    assertSuccessful(snap2Future);
    assertSuccessful(snap3Future);
}
2362+
22912363
private void createIndexWithContent(String indexName, String nodeInclude, String nodeExclude) {
22922364
createIndexWithContent(
22932365
indexName,

server/src/main/java/org/elasticsearch/cluster/SnapshotsInProgress.java

Lines changed: 1 addition & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1240,9 +1240,7 @@ public Entry abort() {
12401240
allQueued &= status.state() == ShardState.QUEUED;
12411241
if (status.state().completed() == false) {
12421242
final String nodeId = status.nodeId();
1243-
final var newState = (nodeId == null || status.state() == ShardState.PAUSED_FOR_NODE_REMOVAL)
1244-
? ShardState.FAILED
1245-
: ShardState.ABORTED;
1243+
final var newState = nodeId == null ? ShardState.FAILED : ShardState.ABORTED;
12461244
status = new ShardSnapshotStatus(nodeId, newState, status.generation(), "aborted by snapshot deletion");
12471245
}
12481246
completed &= status.state().completed();

server/src/main/java/org/elasticsearch/index/snapshots/IndexShardSnapshotStatus.java

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -242,6 +242,15 @@ public synchronized void moveToFailed(final long endTime, final String failure)
242242
}
243243
}
244244

245+
/**
 * Transition from {@link Stage#PAUSED} to {@link Stage#FAILURE}. The abort listeners and timing fields were already
 * handled during the earlier PAUSING → PAUSED transition, so only the stage needs updating.
 */
public void moveFromPausedToFailed() {
    // The CAS must run unconditionally (not inside the assert) so the transition still happens with assertions
    // disabled; the assert only validates that the caller saw the expected prior stage.
    final boolean moved = stage.compareAndSet(Stage.PAUSED, Stage.FAILURE);
    assert moved : "expected stage PAUSED but got " + stage.get();
}
253+
245254
public ShardGeneration generation() {
246255
return generation.get();
247256
}

server/src/main/java/org/elasticsearch/snapshots/SnapshotShardsService.java

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -357,6 +357,19 @@ private void handleUpdatedSnapshotsInProgressEntry(String localNodeId, boolean r
357357
(outcomeInfoString) -> {}
358358
);
359359
}
360+
} else if (snapshotStatus.isPaused()) {
361+
// Shard was paused for node removal then aborted by snapshot deletion.
362+
// abortIfNotCompleted won't transition from PAUSED, so report FAILED directly.
363+
logger.debug("snapshot [{}] is deleted after PAUSED, updating shard snapshot for {} to FAILED", snapshot, sid);
364+
snapshotStatus.moveFromPausedToFailed();
365+
notifyUnsuccessfulSnapshotShard(
366+
snapshot,
367+
sid,
368+
ShardState.FAILED,
369+
shard.getValue().reason(),
370+
shard.getValue().generation(),
371+
(outcomeInfoString) -> {}
372+
);
360373
} else {
361374
snapshotStatus.abortIfNotCompleted("snapshot has been aborted", notifyOnAbortTaskRunner::enqueueTask);
362375
}

0 commit comments

Comments
 (0)