Skip to content

Commit 892518d

Browse files
committed
Test remote state BWC in both CM version directions
The RemotePublicationClusterStateIT test was not deterministically exercising the case where a new-version cluster-manager writes remote cluster state that old-version nodes must deserialize. This meant backwards-incompatible serialization changes could be merged without being caught, since the test would pass whenever an old-version node happened to win the cluster-manager election.

In the mixed-version (one-third upgraded) phase, the test now explicitly tests both directions:

1. Old CM writes state, new nodes read from remote store
2. New CM writes state, old nodes read from remote store

To force a specific version to be cluster-manager, the test uses the voting config exclusions API to repeatedly exclude the current CM until a node of the desired version wins the election. Exclusions are cleared immediately after each re-election so no node leaves the cluster.

Also changes the remote test cluster's dependency on the non-remote test cluster from dependsOn to mustRunAfter, so the remote tests can be run independently without first running the full non-remote suite.

Signed-off-by: Andrew Ross <andrross@amazon.com>
1 parent cfebc67 commit 892518d

File tree

2 files changed

+115
-8
lines changed

2 files changed

+115
-8
lines changed

qa/rolling-upgrade/build.gradle

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -132,7 +132,7 @@ for (Version bwcVersion : BuildParams.bwcVersions.wireCompatible) {
132132
}
133133

134134
tasks.register("${remoteBaseName}#oldClusterTest", StandaloneRestIntegTestTask) {
135-
dependsOn "${baseName}#upgradedClusterTest"
135+
mustRunAfter "${baseName}#upgradedClusterTest"
136136
configureTestTask(it, remoteBaseName, bwcVersionStr, 'old_cluster', false) {
137137
doFirst { delete("${buildDir}/cluster/shared/repo/${remoteBaseName}") }
138138
}

qa/rolling-upgrade/src/test/java/org/opensearch/upgrades/RemotePublicationClusterStateIT.java

Lines changed: 114 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -8,14 +8,17 @@
88

99
package org.opensearch.upgrades;
1010

11+
import org.opensearch.Version;
1112
import org.opensearch.client.Request;
1213
import org.opensearch.client.Response;
1314
import org.opensearch.common.settings.Settings;
1415

16+
import java.io.IOException;
1517
import java.util.Arrays;
1618
import java.util.List;
1719
import java.util.Locale;
1820
import java.util.Map;
21+
import java.util.concurrent.TimeUnit;
1922

2023
/**
2124
* Integration tests for remote publication enabled clusters during rolling upgrades.
@@ -65,18 +68,122 @@ public void testUpgradeWithRemotePublicationEnabled() throws Exception {
6568
verifyComponentTemplateInClusterState(response);
6669
verifyComposableTemplateInClusterState(response);
6770
verifySettingsInClusterState();
71+
} else if (CLUSTER_TYPE == ClusterType.MIXED && firstMixedRound) {
72+
verifyRemotePublicationEnabled();
73+
verifyClusterState();
74+
75+
// Test both cluster-manager version scenarios to ensure remote state serialization
76+
// is backwards compatible in both directions:
77+
// 1. Old CM writes state that new nodes must read from remote store
78+
// 2. New CM writes state that old nodes must read from remote store
79+
ensureClusterManagerVersion(false);
80+
makeClusterStateChange("old_cm");
81+
ensureAllNodesHealthy();
82+
verifyClusterState();
83+
84+
ensureClusterManagerVersion(true);
85+
makeClusterStateChange("new_cm");
86+
ensureAllNodesHealthy();
87+
verifyClusterState();
6888
} else {
6989
verifyRemotePublicationEnabled();
90+
verifyClusterState();
91+
}
92+
}
7093

71-
Request request = new Request("GET", "_cluster/state");
72-
Response response = client().performRequest(request);
73-
assertOK(response);
94+
private void verifyClusterState() throws Exception {
95+
Request request = new Request("GET", "_cluster/state");
96+
Response response = client().performRequest(request);
97+
assertOK(response);
98+
verifyIndexInClusterState(response);
99+
verifyTemplateMetadataInClusterState(response);
100+
verifyComponentTemplateInClusterState(response);
101+
verifyComposableTemplateInClusterState(response);
102+
}
74103

75-
verifyIndexInClusterState(response);
76-
verifyTemplateMetadataInClusterState(response);
77-
verifyComponentTemplateInClusterState(response);
78-
verifyComposableTemplateInClusterState(response);
104+
/**
105+
* Returns true if the current cluster-manager node is running the new (upgraded) version.
106+
*/
107+
private boolean isClusterManagerOnNewVersion() throws IOException {
108+
Map<String, Object> clusterState = entityAsMap(client().performRequest(new Request("GET", "_cluster/state")));
109+
String clusterManagerNodeId = (String) clusterState.get("master_node");
110+
111+
Map<String, Object> nodesInfo = entityAsMap(client().performRequest(new Request("GET", "_nodes")));
112+
Map<String, Object> nodes = (Map<String, Object>) nodesInfo.get("nodes");
113+
Map<String, Object> cmNode = (Map<String, Object>) nodes.get(clusterManagerNodeId);
114+
Version cmVersion = Version.fromString((String) cmNode.get("version"));
115+
return cmVersion.after(UPGRADE_FROM_VERSION);
116+
}
117+
118+
/**
119+
* Ensures the cluster-manager is on the desired version by repeatedly excluding the current CM
120+
* to trigger re-elections until a node of the desired version wins.
121+
*/
122+
private void ensureClusterManagerVersion(boolean newVersion) throws Exception {
123+
String versionLabel = newVersion ? "new" : "old";
124+
long deadline = System.nanoTime() + TimeUnit.MINUTES.toNanos(1);
125+
int attempt = 0;
126+
while (isClusterManagerOnNewVersion() != newVersion) {
127+
if (System.nanoTime() > deadline) {
128+
fail("Failed to get cluster-manager on " + versionLabel + " version after 1 minute");
129+
}
130+
String cmName = getClusterManagerNodeName();
131+
logger.info("Attempt {} to get {} version CM, excluding current CM [{}]", ++attempt, versionLabel, cmName);
132+
133+
Request exclude = new Request("POST", "/_cluster/voting_config_exclusions");
134+
exclude.addParameter("node_names", cmName);
135+
exclude.addParameter("timeout", "30s");
136+
assertOK(client().performRequest(exclude));
137+
138+
// Wait for a different node to become CM
139+
assertBusy(() -> assertNotEquals(cmName, getClusterManagerNodeName()));
140+
141+
// Clear exclusion immediately so the node stays in the cluster
142+
clearVotingConfigExclusions();
79143
}
144+
logger.info("Cluster manager is on {} version: [{}]", versionLabel, getClusterManagerNodeName());
145+
}
146+
147+
private String getClusterManagerNodeName() throws IOException {
148+
Map<String, Object> clusterState = entityAsMap(client().performRequest(new Request("GET", "_cluster/state")));
149+
String cmNodeId = (String) clusterState.get("master_node");
150+
Map<String, Object> nodesInfo = entityAsMap(client().performRequest(new Request("GET", "_nodes")));
151+
Map<String, Object> nodes = (Map<String, Object>) nodesInfo.get("nodes");
152+
Map<String, Object> cmNode = (Map<String, Object>) nodes.get(cmNodeId);
153+
return (String) cmNode.get("name");
154+
}
155+
156+
/**
157+
* Makes a small cluster state change to force the cluster-manager to publish new state,
158+
* which exercises the remote state serialization/deserialization path.
159+
*/
160+
private void makeClusterStateChange(String suffix) throws IOException {
161+
Request putSettings = new Request("PUT", "_cluster/settings");
162+
putSettings.setJsonEntity(String.format(Locale.ROOT, """
163+
{
164+
"transient": {
165+
"cluster.routing.allocation.exclude._name": "nonexistent_node_%s"
166+
}
167+
}""", suffix));
168+
assertOK(client().performRequest(putSettings));
169+
}
170+
171+
private void clearVotingConfigExclusions() throws IOException {
172+
Request clearRequest = new Request("DELETE", "/_cluster/voting_config_exclusions");
173+
clearRequest.addParameter("wait_for_removal", "false");
174+
assertOK(client().performRequest(clearRequest));
175+
}
176+
177+
/**
178+
* Verifies all 3 nodes are present and the cluster is healthy. If any node failed to apply
179+
* cluster state (e.g. due to remote state deserialization errors), it will not be part of
180+
* the cluster and this check will fail.
181+
*/
182+
private void ensureAllNodesHealthy() throws Exception {
183+
ensureHealth(request -> {
184+
request.addParameter("wait_for_nodes", "3");
185+
request.addParameter("timeout", "60s");
186+
});
80187
}
81188

82189
private static void createIndexTemplate() throws Exception {

0 commit comments

Comments
 (0)