From 9d81fc93656657147a945da1ae0320f79c308d25 Mon Sep 17 00:00:00 2001
From: Yang Wang
Date: Thu, 31 Jul 2025 15:12:46 +1000
Subject: [PATCH] [Test] Put shutdown marker on the last upgraded node only (#132157)

This is the same failure as observed in #129644, for which the original
fix #129680 did not really work. It did not work because of the ordering
of checks: the shutdown marker is removed only after the cluster passes
the ready check so that new shards can be allocated, but the cluster
cannot pass the ready check before the shards are allocated. Hence the
circular dependency. In hindsight, there is no need to put a shutdown
record on all nodes. It is only needed on the node that upgrades last,
to prevent the snapshot from completing during the upgrade process. This
PR does that, which ensures there are always 2 nodes available for
hosting new shards.

Resolves: #132135
Resolves: #132136
Resolves: #132137

(cherry picked from commit f39ccb500a55fd0d34ed80a0399f361bdee2f301)

# Conflicts:
#	muted-tests.yml
---
 .../upgrades/RunningSnapshotIT.java | 59 ++++++++-----------
 1 file changed, 25 insertions(+), 34 deletions(-)

diff --git a/qa/rolling-upgrade/src/javaRestTest/java/org/elasticsearch/upgrades/RunningSnapshotIT.java b/qa/rolling-upgrade/src/javaRestTest/java/org/elasticsearch/upgrades/RunningSnapshotIT.java
index 261e92c5d7b65..3e2dfa24e7237 100644
--- a/qa/rolling-upgrade/src/javaRestTest/java/org/elasticsearch/upgrades/RunningSnapshotIT.java
+++ b/qa/rolling-upgrade/src/javaRestTest/java/org/elasticsearch/upgrades/RunningSnapshotIT.java
@@ -12,12 +12,10 @@
 import com.carrotsearch.randomizedtesting.annotations.Name;
 
 import org.elasticsearch.client.Request;
-import org.elasticsearch.common.Strings;
 import org.elasticsearch.common.settings.Settings;
 import org.elasticsearch.test.rest.ObjectPath;
 
 import java.io.IOException;
-import java.util.Collection;
 import java.util.Map;
 import java.util.stream.Collectors;
 
@@ -26,7 +24,6 @@
 import static org.hamcrest.Matchers.containsInAnyOrder;
 import static org.hamcrest.Matchers.empty;
 import static org.hamcrest.Matchers.equalTo;
-import static org.hamcrest.Matchers.hasSize;
 import static org.hamcrest.Matchers.not;
 
 public class RunningSnapshotIT extends AbstractRollingUpgradeTestCase {
@@ -45,6 +42,13 @@ public void testRunningSnapshotCompleteAfterUpgrade() throws Exception {
             .collect(Collectors.toUnmodifiableMap(Map.Entry::getKey, entry -> entry.getValue().get("name").toString()));
         assertThat(nodeIdToNodeNames.values(), containsInAnyOrder("test-cluster-0", "test-cluster-1", "test-cluster-2"));
 
+        final var lastUpgradeNodeId = nodeIdToNodeNames.entrySet()
+            .stream()
+            .filter(entry -> "test-cluster-2".equals(entry.getValue()))
+            .map(Map.Entry::getKey)
+            .findFirst()
+            .orElseThrow(() -> new AssertionError("node id not found in " + nodeIdToNodeNames));
+
         if (isOldCluster()) {
             registerRepository(repositoryName, "fs", randomBoolean(), Settings.builder().put("location", "backup").build());
             // create an index to have one shard per node
@@ -54,54 +58,41 @@ public void testRunningSnapshotCompleteAfterUpgrade() throws Exception {
                 indexDocs(indexName, between(10, 50));
             }
             flush(indexName, true);
-            // Signal shutdown to prevent snapshot from being completed
-            putShutdownMetadata(nodeIdToNodeNames.keySet());
+            // Signal shutdown for the last node to upgrade to prevent snapshot from being completed during the upgrade process
+            putShutdownMetadata(lastUpgradeNodeId);
             createSnapshot(repositoryName, snapshotName, false);
             assertRunningSnapshot(repositoryName, snapshotName);
         } else {
             if (isUpgradedCluster()) {
-                deleteShutdownMetadata(nodeIdToNodeNames.keySet());
-                assertNoShutdownMetadata(nodeIdToNodeNames.keySet());
+                deleteShutdownMetadata(lastUpgradeNodeId);
+                assertNoShutdownMetadata(lastUpgradeNodeId);
                 ensureGreen(indexName);
                 assertBusy(() -> assertCompletedSnapshot(repositoryName, snapshotName));
             } else {
-                if (isFirstMixedCluster()) {
-                    final var upgradedNodeIds = nodeIdToNodeNames.entrySet()
-                        .stream()
-                        .filter(entry -> "test-cluster-0".equals(entry.getValue()))
-                        .map(Map.Entry::getKey)
-                        .collect(Collectors.toUnmodifiableSet());
-                    assertThat(upgradedNodeIds, hasSize(1));
-                    deleteShutdownMetadata(upgradedNodeIds);
-                }
                 assertRunningSnapshot(repositoryName, snapshotName);
             }
         }
     }
 
-    private void putShutdownMetadata(Collection<String> nodeIds) throws IOException {
-        for (String nodeId : nodeIds) {
-            final Request putShutdownRequest = new Request("PUT", "/_nodes/" + nodeId + "/shutdown");
-            putShutdownRequest.setJsonEntity("""
-                {
-                    "type": "remove",
-                    "reason": "test"
-                }""");
-            client().performRequest(putShutdownRequest);
-        }
+    private void putShutdownMetadata(String nodeId) throws IOException {
+        final Request putShutdownRequest = new Request("PUT", "/_nodes/" + nodeId + "/shutdown");
+        putShutdownRequest.setJsonEntity("""
+            {
+                "type": "remove",
+                "reason": "test"
+            }""");
+        client().performRequest(putShutdownRequest);
     }
 
-    private void deleteShutdownMetadata(Collection<String> nodeIds) throws IOException {
-        for (String nodeId : nodeIds) {
-            final Request request = new Request("DELETE", "/_nodes/" + nodeId + "/shutdown");
-            request.addParameter(IGNORE_RESPONSE_CODES_PARAM, "404");
-            client().performRequest(request);
-        }
+    private void deleteShutdownMetadata(String nodeId) throws IOException {
+        final Request request = new Request("DELETE", "/_nodes/" + nodeId + "/shutdown");
+        request.addParameter(IGNORE_RESPONSE_CODES_PARAM, "404");
+        client().performRequest(request);
     }
 
-    private void assertNoShutdownMetadata(Collection<String> nodeIds) throws IOException {
+    private void assertNoShutdownMetadata(String nodeId) throws IOException {
         final ObjectPath responsePath = assertOKAndCreateObjectPath(
-            client().performRequest(new Request("GET", "/_nodes/" + Strings.collectionToCommaDelimitedString(nodeIds) + "/shutdown"))
+            client().performRequest(new Request("GET", "/_nodes/" + nodeId + "/shutdown"))
         );
         assertThat(responsePath.evaluate("nodes"), empty());
     }
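
Aside (not part of the patch): a minimal, self-contained sketch of the node shutdown API calls this test relies on, written against the Elasticsearch low-level REST client. The host/port and the hard-coded node id are assumptions for illustration only; the test itself resolves the node id from the /_nodes response for "test-cluster-2".

import org.apache.http.HttpHost;
import org.elasticsearch.client.Request;
import org.elasticsearch.client.RestClient;

// Sketch: register a "remove" shutdown marker on one node, then clear it.
// While the marker exists, an in-progress snapshot does not complete, which
// is the behaviour the test exploits across the rolling upgrade.
public class ShutdownMarkerSketch {
    public static void main(String[] args) throws Exception {
        // Assumption: a test cluster reachable on localhost:9200.
        try (RestClient client = RestClient.builder(new HttpHost("localhost", 9200)).build()) {
            // Placeholder id; the test derives it from GET /_nodes ("test-cluster-2").
            String nodeId = "last-upgrade-node-id";

            // PUT /_nodes/{nodeId}/shutdown registers the marker.
            Request put = new Request("PUT", "/_nodes/" + nodeId + "/shutdown");
            put.setJsonEntity("""
                {
                  "type": "remove",
                  "reason": "test"
                }""");
            client.performRequest(put);

            // DELETE /_nodes/{nodeId}/shutdown clears it, allowing the snapshot to finish.
            Request delete = new Request("DELETE", "/_nodes/" + nodeId + "/shutdown");
            client.performRequest(delete);
        }
    }
}

Marking only the last-upgraded node keeps the marker in place for the whole upgrade while leaving the other two nodes free to host newly allocated shards, which is what breaks the circular dependency described in the commit message.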