Skip to content

Commit af96109

Browse files
authored
Check no unassigned shards even if the node already left (#94722)
We are currently checking if there are any unassigned shards on the shutting down node before returning SingleNodeShutdownMetadata.Status.COMPLETE. However this check is only triggered if the node is still running. If the node unexpectedly exists the cluster or dies during the migration then we are at risk of loosing shard data of those unassigned shards. This change performs the check even if the node is no longer part of the cluster.
1 parent 5d5ab5b commit af96109

File tree

4 files changed

+62
-58
lines changed

4 files changed

+62
-58
lines changed

docs/changelog/94722.yaml

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,5 @@
1+
pr: 94722
2+
summary: Check no unassigned shards even if the node already left
3+
area: Infra/Core
4+
type: bug
5+
issues: []

x-pack/plugin/shutdown/src/internalClusterTest/java/org/elasticsearch/xpack/shutdown/NodeShutdownShardsIT.java

Lines changed: 3 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -414,7 +414,7 @@ public void testNodeShutdownWithUnassignedShards() throws Exception {
414414
final String nodeA = internalCluster().startNode();
415415
final String nodeAId = getNodeId(nodeA);
416416

417-
createIndex("index", 1, 1);
417+
createIndex("index", 1, 0);
418418

419419
ensureYellow("index");
420420

@@ -423,18 +423,10 @@ public void testNodeShutdownWithUnassignedShards() throws Exception {
423423
final String nodeBId = getNodeId(nodeB);
424424
ensureGreen("index");
425425

426-
updateClusterSettings(Settings.builder().put("cluster.routing.allocation.enable", "none"));
427-
428-
assertThat(client().admin().indices().prepareFlush("index").get().getSuccessfulShards(), equalTo(2));
429-
assertThat(client().admin().indices().prepareRefresh("index").get().getSuccessfulShards(), equalTo(2));
430-
431-
internalCluster().restartNode(nodeA);
432-
internalCluster().restartNode(nodeB);
433-
434-
assertThat(client().admin().cluster().prepareHealth("index").get().getUnassignedShards(), equalTo(1));
435-
436426
putNodeShutdown(nodeAId, SingleNodeShutdownMetadata.Type.REMOVE, null);
437427

428+
internalCluster().stopNode(nodeA);
429+
438430
assertBusy(() -> assertNodeShutdownStatus(nodeAId, STALLED));
439431
}
440432

x-pack/plugin/shutdown/src/main/java/org/elasticsearch/xpack/shutdown/TransportGetShutdownStatusAction.java

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -189,12 +189,6 @@ static ShutdownShardMigrationStatus shardMigrationStatus(
189189
);
190190
}
191191

192-
// The node is in `DiscoveryNodes`, but not `RoutingNodes` - so there are no shards assigned to it. We're done.
193-
if (currentState.getRoutingNodes().node(nodeId) == null) {
194-
// We don't know about that node
195-
return new ShutdownShardMigrationStatus(SingleNodeShutdownMetadata.Status.COMPLETE, 0);
196-
}
197-
198192
final RoutingAllocation allocation = new RoutingAllocation(
199193
allocationDeciders,
200194
currentState,
@@ -233,6 +227,12 @@ static ShutdownShardMigrationStatus shardMigrationStatus(
233227
);
234228
}
235229

230+
// The node is in `DiscoveryNodes`, but not `RoutingNodes` - so there are no shards assigned to it. We're done.
231+
if (currentState.getRoutingNodes().node(nodeId) == null) {
232+
// We don't know about that node
233+
return new ShutdownShardMigrationStatus(SingleNodeShutdownMetadata.Status.COMPLETE, 0);
234+
}
235+
236236
// Check if there are any shards currently on this node, and if there are any relocating shards
237237
int startedShards = currentState.getRoutingNodes().node(nodeId).numberOfShardsWithState(ShardRoutingState.STARTED);
238238
int relocatingShards = currentState.getRoutingNodes().node(nodeId).numberOfShardsWithState(ShardRoutingState.RELOCATING);

x-pack/plugin/shutdown/src/test/java/org/elasticsearch/xpack/shutdown/TransportGetShutdownStatusActionTests.java

Lines changed: 48 additions & 41 deletions
Original file line numberDiff line numberDiff line change
@@ -47,13 +47,13 @@
4747
import org.hamcrest.Matcher;
4848
import org.junit.Before;
4949

50-
import java.util.Collections;
51-
import java.util.HashMap;
5250
import java.util.List;
5351
import java.util.Map;
5452
import java.util.Set;
5553
import java.util.concurrent.atomic.AtomicReference;
54+
import java.util.function.Function;
5655

56+
import static java.util.stream.Collectors.toMap;
5757
import static org.hamcrest.Matchers.allOf;
5858
import static org.hamcrest.Matchers.containsString;
5959
import static org.hamcrest.Matchers.equalTo;
@@ -139,7 +139,7 @@ public Decision canRebalance(RoutingAllocation allocation) {
139139
*/
140140
public void testEmptyCluster() {
141141
RoutingTable routingTable = RoutingTable.EMPTY_ROUTING_TABLE;
142-
ClusterState state = createTestClusterState(routingTable, Collections.emptyList(), SingleNodeShutdownMetadata.Type.REMOVE);
142+
ClusterState state = createTestClusterState(routingTable, List.of(), SingleNodeShutdownMetadata.Type.REMOVE);
143143

144144
ShutdownShardMigrationStatus status = TransportGetShutdownStatusAction.shardMigrationStatus(
145145
state,
@@ -483,17 +483,16 @@ public void testOnlyInitializingShardsRemaining() {
483483

484484
public void testNodeNotInCluster() {
485485
String bogusNodeId = randomAlphaOfLength(10);
486-
Map<String, IndexMetadata> indicesTable = new HashMap<>();
487486
RoutingTable.Builder routingTable = RoutingTable.builder();
488487

489488
ClusterState state = ClusterState.builder(ClusterState.EMPTY_STATE)
490489
.metadata(
491490
Metadata.builder()
492-
.indices(indicesTable)
491+
.indices(Map.of())
493492
.putCustom(
494493
NodesShutdownMetadata.TYPE,
495494
new NodesShutdownMetadata(
496-
Collections.singletonMap(
495+
Map.of(
497496
bogusNodeId,
498497
SingleNodeShutdownMetadata.builder()
499498
.setType(SingleNodeShutdownMetadata.Type.REMOVE)
@@ -580,8 +579,40 @@ private ClusterState createTestClusterState(
580579
List<IndexMetadata> indices,
581580
SingleNodeShutdownMetadata.Type shutdownType
582581
) {
583-
Map<String, IndexMetadata> indicesTable = new HashMap<>();
584-
indices.forEach(imd -> { indicesTable.put(imd.getIndex().getName(), imd); });
582+
return createTestClusterState(indexRoutingTable, indices, shutdownType, false);
583+
}
584+
585+
private ClusterState createTestClusterState(
586+
RoutingTable indexRoutingTable,
587+
List<IndexMetadata> indices,
588+
SingleNodeShutdownMetadata.Type shutdownType,
589+
boolean shuttingDownNodeAlreadyLeft
590+
) {
591+
Map<String, IndexMetadata> indicesTable = indices.stream().collect(toMap(imd -> imd.getIndex().getName(), Function.identity()));
592+
DiscoveryNodes.Builder discoveryNodesBuilder = DiscoveryNodes.builder()
593+
.add(
594+
DiscoveryNode.createLocal(
595+
Settings.builder().put(Node.NODE_NAME_SETTING.getKey(), LIVE_NODE_ID).build(),
596+
new TransportAddress(TransportAddress.META_ADDRESS, 9201),
597+
LIVE_NODE_ID
598+
)
599+
)
600+
.add(
601+
DiscoveryNode.createLocal(
602+
Settings.builder().put(Node.NODE_NAME_SETTING.getKey(), OTHER_LIVE_NODE_ID).build(),
603+
new TransportAddress(TransportAddress.META_ADDRESS, 9202),
604+
OTHER_LIVE_NODE_ID
605+
)
606+
);
607+
if (shuttingDownNodeAlreadyLeft == false) {
608+
discoveryNodesBuilder.add(
609+
DiscoveryNode.createLocal(
610+
Settings.builder().put(Node.NODE_NAME_SETTING.getKey(), SHUTTING_DOWN_NODE_ID).build(),
611+
new TransportAddress(TransportAddress.META_ADDRESS, 9200),
612+
SHUTTING_DOWN_NODE_ID
613+
)
614+
);
615+
}
585616

586617
return ClusterState.builder(ClusterState.EMPTY_STATE)
587618
.metadata(
@@ -590,7 +621,7 @@ private ClusterState createTestClusterState(
590621
.putCustom(
591622
NodesShutdownMetadata.TYPE,
592623
new NodesShutdownMetadata(
593-
Collections.singletonMap(
624+
Map.of(
594625
SHUTTING_DOWN_NODE_ID,
595626
SingleNodeShutdownMetadata.builder()
596627
.setType(shutdownType)
@@ -602,36 +633,7 @@ private ClusterState createTestClusterState(
602633
)
603634
)
604635
)
605-
.nodes(
606-
DiscoveryNodes.builder()
607-
.add(
608-
DiscoveryNode.createLocal(
609-
Settings.builder()
610-
.put(Settings.builder().build())
611-
.put(Node.NODE_NAME_SETTING.getKey(), SHUTTING_DOWN_NODE_ID)
612-
.build(),
613-
new TransportAddress(TransportAddress.META_ADDRESS, 9200),
614-
SHUTTING_DOWN_NODE_ID
615-
)
616-
)
617-
.add(
618-
DiscoveryNode.createLocal(
619-
Settings.builder().put(Settings.builder().build()).put(Node.NODE_NAME_SETTING.getKey(), LIVE_NODE_ID).build(),
620-
new TransportAddress(TransportAddress.META_ADDRESS, 9201),
621-
LIVE_NODE_ID
622-
)
623-
)
624-
.add(
625-
DiscoveryNode.createLocal(
626-
Settings.builder()
627-
.put(Settings.builder().build())
628-
.put(Node.NODE_NAME_SETTING.getKey(), OTHER_LIVE_NODE_ID)
629-
.build(),
630-
new TransportAddress(TransportAddress.META_ADDRESS, 9202),
631-
OTHER_LIVE_NODE_ID
632-
)
633-
)
634-
)
636+
.nodes(discoveryNodesBuilder)
635637
.routingTable(indexRoutingTable)
636638
.build();
637639
}
@@ -646,7 +648,7 @@ private UnassignedInfo makeUnassignedInfo(String nodeId) {
646648
System.currentTimeMillis(),
647649
false,
648650
UnassignedInfo.AllocationStatus.NO_ATTEMPT,
649-
Collections.emptySet(),
651+
Set.of(),
650652
nodeId
651653
);
652654
}
@@ -680,7 +682,12 @@ private ShutdownShardMigrationStatus getUnassignedShutdownStatus(Index index, In
680682

681683
RoutingTable.Builder routingTable = RoutingTable.builder();
682684
routingTable.add(indexRoutingTable);
683-
ClusterState state = createTestClusterState(routingTable.build(), List.of(imd), SingleNodeShutdownMetadata.Type.REMOVE);
685+
ClusterState state = createTestClusterState(
686+
routingTable.build(),
687+
List.of(imd),
688+
SingleNodeShutdownMetadata.Type.REMOVE,
689+
randomBoolean()
690+
);
684691

685692
return TransportGetShutdownStatusAction.shardMigrationStatus(
686693
state,

0 commit comments

Comments
 (0)