|
24 | 24 | import org.elasticsearch.cluster.routing.ShardRouting;
|
25 | 25 | import org.elasticsearch.cluster.routing.ShardRoutingState;
|
26 | 26 | import org.elasticsearch.cluster.routing.TestShardRouting;
|
| 27 | +import org.elasticsearch.cluster.routing.UnassignedInfo; |
27 | 28 | import org.elasticsearch.cluster.routing.allocation.AllocationService;
|
28 | 29 | import org.elasticsearch.cluster.routing.allocation.RoutingAllocation;
|
29 | 30 | import org.elasticsearch.cluster.routing.allocation.allocator.BalancedShardsAllocator;
|
|
34 | 35 | import org.elasticsearch.cluster.routing.allocation.decider.NodeShutdownAllocationDecider;
|
35 | 36 | import org.elasticsearch.common.settings.Settings;
|
36 | 37 | import org.elasticsearch.common.transport.TransportAddress;
|
| 38 | +import org.elasticsearch.gateway.GatewayAllocator; |
37 | 39 | import org.elasticsearch.index.Index;
|
38 | 40 | import org.elasticsearch.index.shard.ShardId;
|
39 | 41 | import org.elasticsearch.node.Node;
|
40 | 42 | import org.elasticsearch.snapshots.SnapshotShardSizeInfo;
|
41 | 43 | import org.elasticsearch.snapshots.SnapshotsInfoService;
|
42 | 44 | import org.elasticsearch.test.ESTestCase;
|
| 45 | +import org.elasticsearch.test.gateway.TestGatewayAllocator; |
43 | 46 | import org.hamcrest.Matcher;
|
44 | 47 | import org.junit.Before;
|
45 | 48 |
|
46 | 49 | import java.util.Collections;
|
47 | 50 | import java.util.HashMap;
|
48 | 51 | import java.util.List;
|
49 | 52 | import java.util.Map;
|
| 53 | +import java.util.Set; |
50 | 54 | import java.util.concurrent.atomic.AtomicReference;
|
51 | 55 |
|
52 | 56 | import static org.hamcrest.Matchers.allOf;
|
53 | 57 | import static org.hamcrest.Matchers.containsString;
|
54 | 58 | import static org.hamcrest.Matchers.equalTo;
|
55 | 59 | import static org.hamcrest.Matchers.is;
|
56 | 60 | import static org.hamcrest.Matchers.nullValue;
|
| 61 | +import static org.mockito.ArgumentMatchers.anyInt; |
| 62 | +import static org.mockito.Mockito.doAnswer; |
| 63 | +import static org.mockito.Mockito.spy; |
57 | 64 |
|
58 | 65 | public class TransportGetShutdownStatusActionTests extends ESTestCase {
|
59 | 66 | public static final String SHUTTING_DOWN_NODE_ID = "node1";
|
@@ -122,6 +129,7 @@ public Decision canRebalance(RoutingAllocation allocation) {
|
122 | 129 | clusterInfoService,
|
123 | 130 | snapshotsInfoService
|
124 | 131 | );
|
| 132 | + allocationService.setExistingShardsAllocators(Map.of(GatewayAllocator.ALLOCATOR_NAME, new TestGatewayAllocator())); |
125 | 133 | }
|
126 | 134 |
|
127 | 135 | /**
|
@@ -349,6 +357,60 @@ public void testStalled() {
|
349 | 357 | );
|
350 | 358 | }
|
351 | 359 |
|
| 360 | + /** |
| 361 | + * Ensure we can detect stalled migrations when we have unassigned shards that had the shutting down node as their last known |
| 362 | + * node id |
| 363 | + */ |
| 364 | + public void testStalledUnassigned() { |
| 365 | + Index index = new Index(randomAlphaOfLength(5), randomAlphaOfLengthBetween(1, 20)); |
| 366 | + IndexMetadata imd = spy(generateIndexMetadata(index, 3, 0)); |
| 367 | + // make sure the TestGatewayAllocator stays in sync always, avoid flaky tests |
| 368 | + doAnswer(i -> { |
| 369 | + if ((Integer) i.getArgument(0) < 2) { |
| 370 | + return Set.of(LIVE_NODE_ID); |
| 371 | + } |
| 372 | + return Set.of(SHUTTING_DOWN_NODE_ID); |
| 373 | + }).when(imd).inSyncAllocationIds(anyInt()); |
| 374 | + |
| 375 | + var shard0 = TestShardRouting.newShardRouting(new ShardId(index, 0), LIVE_NODE_ID, true, ShardRoutingState.STARTED); |
| 376 | + var shard1 = TestShardRouting.newShardRouting(new ShardId(index, 1), LIVE_NODE_ID, true, ShardRoutingState.STARTED); |
| 377 | + |
| 378 | + // we should stall the node if we find an unassigned shard with lastAllocatedNodeId matching the shutting down node |
| 379 | + var unassigned = makeUnassignedShard(index, 2, SHUTTING_DOWN_NODE_ID, true); |
| 380 | + |
| 381 | + assertShardMigration( |
| 382 | + getUnassignedShutdownStatus(index, imd, shard0, shard1, unassigned), |
| 383 | + SingleNodeShutdownMetadata.Status.STALLED, |
| 384 | + 1, |
| 385 | + allOf(containsString(index.getName()), containsString("[2] [primary]")) |
| 386 | + ); |
| 387 | + |
| 388 | + // if the shard is unassigned, but it's not a primary on this node, we shouldn't stall |
| 389 | + var shard2 = TestShardRouting.newShardRouting(new ShardId(index, 2), LIVE_NODE_ID, true, ShardRoutingState.STARTED); |
| 390 | + var unassignedReplica = makeUnassignedShard(index, 2, SHUTTING_DOWN_NODE_ID, false); |
| 391 | + |
| 392 | + var s = getUnassignedShutdownStatus(index, imd, shard0, shard1, shard2, unassignedReplica); |
| 393 | + assertShardMigration(s, SingleNodeShutdownMetadata.Status.COMPLETE, 0, nullValue()); |
| 394 | + |
| 395 | + // check if we correctly count all of the unassigned shards |
| 396 | + var unassigned3 = makeUnassignedShard(index, 3, SHUTTING_DOWN_NODE_ID, true); |
| 397 | + |
| 398 | + assertShardMigration( |
| 399 | + getUnassignedShutdownStatus(index, imd, shard0, shard1, unassigned3, unassigned), |
| 400 | + SingleNodeShutdownMetadata.Status.STALLED, |
| 401 | + 2, |
| 402 | + allOf(containsString(index.getName()), containsString("[2] [primary]")) |
| 403 | + ); |
| 404 | + |
| 405 | + // check if we correctly walk all of the unassigned shards, shard 2 replica, shard 3 primary |
| 406 | + assertShardMigration( |
| 407 | + getUnassignedShutdownStatus(index, imd, shard0, shard1, shard2, unassignedReplica, unassigned3), |
| 408 | + SingleNodeShutdownMetadata.Status.STALLED, |
| 409 | + 1, |
| 410 | + allOf(containsString(index.getName()), containsString("[3] [primary]")) |
| 411 | + ); |
| 412 | + } |
| 413 | + |
352 | 414 | public void testNotStalledIfAllShardsHaveACopyOnAnotherNode() {
|
353 | 415 | Index index = new Index(randomAlphaOfLength(5), randomAlphaOfLengthBetween(1, 20));
|
354 | 416 | IndexMetadata imd = generateIndexMetadata(index, 3, 0);
|
@@ -571,4 +633,62 @@ private ClusterState createTestClusterState(
|
571 | 633 | .routingTable(indexRoutingTable)
|
572 | 634 | .build();
|
573 | 635 | }
|
| 636 | + |
| 637 | + private UnassignedInfo makeUnassignedInfo(String nodeId) { |
| 638 | + return new UnassignedInfo( |
| 639 | + UnassignedInfo.Reason.ALLOCATION_FAILED, |
| 640 | + "testing", |
| 641 | + null, |
| 642 | + 1, |
| 643 | + System.nanoTime(), |
| 644 | + System.currentTimeMillis(), |
| 645 | + false, |
| 646 | + UnassignedInfo.AllocationStatus.NO_ATTEMPT, |
| 647 | + Collections.emptySet(), |
| 648 | + nodeId |
| 649 | + ); |
| 650 | + } |
| 651 | + |
| 652 | + private ShardRouting makeUnassignedShard(Index index, int shardId, String nodeId, boolean primary) { |
| 653 | + var unsignedInfo = makeUnassignedInfo(nodeId); |
| 654 | + |
| 655 | + return TestShardRouting.newShardRouting( |
| 656 | + new ShardId(index, shardId), |
| 657 | + null, |
| 658 | + null, |
| 659 | + primary, |
| 660 | + ShardRoutingState.UNASSIGNED, |
| 661 | + unsignedInfo |
| 662 | + ); |
| 663 | + } |
| 664 | + |
| 665 | + private ShutdownShardMigrationStatus getUnassignedShutdownStatus(Index index, IndexMetadata imd, ShardRouting... shards) { |
| 666 | + var indexRoutingTableBuilder = IndexRoutingTable.builder(index); |
| 667 | + |
| 668 | + for (var routing : shards) { |
| 669 | + indexRoutingTableBuilder.addShard(routing); |
| 670 | + } |
| 671 | + |
| 672 | + var indexRoutingTable = indexRoutingTableBuilder.build(); |
| 673 | + |
| 674 | + // Force a decision of NO for all moves and new allocations, simulating a decider that's stuck |
| 675 | + canAllocate.set((r, n, a) -> Decision.NO); |
| 676 | + // And the remain decider simulates NodeShutdownAllocationDecider |
| 677 | + canRemain.set((r, n, a) -> n.nodeId().equals(SHUTTING_DOWN_NODE_ID) ? Decision.NO : Decision.YES); |
| 678 | + |
| 679 | + RoutingTable.Builder routingTable = RoutingTable.builder(); |
| 680 | + routingTable.add(indexRoutingTable); |
| 681 | + ClusterState state = createTestClusterState(routingTable.build(), List.of(imd), SingleNodeShutdownMetadata.Type.REMOVE); |
| 682 | + |
| 683 | + return TransportGetShutdownStatusAction.shardMigrationStatus( |
| 684 | + state, |
| 685 | + SHUTTING_DOWN_NODE_ID, |
| 686 | + SingleNodeShutdownMetadata.Type.REMOVE, |
| 687 | + true, |
| 688 | + clusterInfoService, |
| 689 | + snapshotsInfoService, |
| 690 | + allocationService, |
| 691 | + allocationDeciders |
| 692 | + ); |
| 693 | + } |
574 | 694 | }
|
0 commit comments