
Commit b94a20e
Reset relocation/allocation failure counter on node join/shutdown (#119968)
We prevent retries of allocations/relocations once they see index.allocation.max_retries failed attempts (default 5). In #108987, we added resetting the allocation failure counters when a node joins the cluster. As discussed in the linked discussion, it makes sense to extend this reset to relocations as well, and to also consider node shutdown events. With this change we reset both allocation and relocation failure counters when a new node joins the cluster or shutdown metadata is applied. The subset of shutdown events that we consider, and how we track them, is more or less copied from what was done for #106998; the same logic seemed to make sense here too. Closes ES-10492
1 parent 38b0e92 commit b94a20e
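
For reference, the manual escape hatch that predates this change still exists: once a shard hits the retry limit, an operator can clear the counters with the reroute API's retry_failed flag, which goes through the same resetFailedCounter path touched in the ShardsAllocator diff below. A minimal sketch using the low-level Java REST client; the endpoint, client setup, and class name are illustrative assumptions, not part of this commit:

    import org.apache.http.HttpHost;
    import org.elasticsearch.client.Request;
    import org.elasticsearch.client.Response;
    import org.elasticsearch.client.RestClient;

    public class RetryFailedAllocationsSketch {
        public static void main(String[] args) throws Exception {
            // "localhost:9200" is a placeholder endpoint for a local cluster.
            try (RestClient client = RestClient.builder(new HttpHost("localhost", 9200, "http")).build()) {
                // Asks the master to reset the failure counters and re-attempt allocation of shards
                // that were blocked by index.allocation.max_retries.
                Request reroute = new Request("POST", "/_cluster/reroute");
                reroute.addParameter("retry_failed", "true");
                Response response = client.performRequest(reroute);
                System.out.println(response.getStatusLine());
            }
        }
    }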

File tree

6 files changed: +548 additions, -11 deletions


docs/changelog/119968.yaml

Lines changed: 5 additions & 0 deletions
@@ -0,0 +1,5 @@
+pr: 119968
+summary: Reset relocation/allocation failure counter on node join/shutdown
+area: Allocation
+type: enhancement
+issues: []

server/src/internalClusterTest/java/org/elasticsearch/cluster/routing/allocation/AllocationFailuresResetIT.java

Lines changed: 80 additions & 5 deletions
@@ -9,17 +9,29 @@

 package org.elasticsearch.cluster.routing.allocation;

+import org.apache.logging.log4j.Level;
+import org.elasticsearch.cluster.routing.RoutingNodes;
 import org.elasticsearch.cluster.routing.ShardRouting;
+import org.elasticsearch.cluster.routing.ShardRoutingState;
 import org.elasticsearch.cluster.routing.allocation.decider.MaxRetryAllocationDecider;
+import org.elasticsearch.common.Strings;
 import org.elasticsearch.common.settings.Settings;
+import org.elasticsearch.index.Index;
 import org.elasticsearch.index.shard.IndexEventListener;
 import org.elasticsearch.plugins.Plugin;
 import org.elasticsearch.test.ESIntegTestCase;
 import org.elasticsearch.test.ESIntegTestCase.ClusterScope;
 import org.elasticsearch.test.ESIntegTestCase.Scope;
 import org.elasticsearch.test.MockIndexEventListener;
+import org.elasticsearch.test.MockLog;

 import java.util.List;
+import java.util.concurrent.atomic.AtomicBoolean;
+
+import static org.elasticsearch.cluster.metadata.IndexMetadata.INDEX_ROUTING_EXCLUDE_GROUP_PREFIX;
+import static org.hamcrest.CoreMatchers.equalTo;
+import static org.hamcrest.CoreMatchers.not;
+import static org.hamcrest.CoreMatchers.notNullValue;

 @ClusterScope(scope = Scope.TEST, numDataNodes = 0)
 public class AllocationFailuresResetIT extends ESIntegTestCase {
@@ -49,7 +61,7 @@ private void removeAllocationFailuresInjection(String node) {
     private void awaitShardAllocMaxRetries() throws Exception {
         var maxRetries = MaxRetryAllocationDecider.SETTING_ALLOCATION_MAX_RETRY.get(internalCluster().getDefaultSettings());
         assertBusy(() -> {
-            var state = clusterAdmin().prepareState(TEST_REQUEST_TIMEOUT).get().getState();
+            var state = safeGet(clusterAdmin().prepareState(TEST_REQUEST_TIMEOUT).execute()).getState();
             var index = state.getRoutingTable().index(INDEX);
             assertNotNull(index);
             var shard = index.shard(SHARD).primaryShard();
@@ -62,7 +74,7 @@ private void awaitShardAllocMaxRetries() throws Exception {

     private void awaitShardAllocSucceed() throws Exception {
         assertBusy(() -> {
-            var state = clusterAdmin().prepareState(TEST_REQUEST_TIMEOUT).get().getState();
+            var state = safeGet(clusterAdmin().prepareState(TEST_REQUEST_TIMEOUT).execute()).getState();
             var index = state.getRoutingTable().index(INDEX);
             assertNotNull(index);
             var shard = index.shard(SHARD).primaryShard();
@@ -72,14 +84,77 @@ private void awaitShardAllocSucceed() throws Exception {
         });
     }

-    public void testResetFailuresOnNodeJoin() throws Exception {
+    public void testResetAllocationFailuresOnNodeJoin() throws Exception {
         var node1 = internalCluster().startNode();
         injectAllocationFailures(node1);
         prepareCreate(INDEX, indexSettings(1, 0)).execute();
         awaitShardAllocMaxRetries();
         removeAllocationFailuresInjection(node1);
-        internalCluster().startNode();
-        awaitShardAllocSucceed();
+        try (var mockLog = MockLog.capture(RoutingNodes.class)) {
+            var shardId = internalCluster().clusterService().state().routingTable().index(INDEX).shard(SHARD).shardId();
+            mockLog.addExpectation(
+                new MockLog.SeenEventExpectation(
+                    "log resetting failed allocations",
+                    RoutingNodes.class.getName(),
+                    Level.INFO,
+                    Strings.format(RoutingNodes.RESET_FAILED_ALLOCATION_COUNTER_LOG_MSG, 1, List.of(shardId))
+                )
+            );
+            internalCluster().startNode();
+            awaitShardAllocSucceed();
+            mockLog.assertAllExpectationsMatched();
+        }
     }

+    public void testResetRelocationFailuresOnNodeJoin() throws Exception {
+        String node1 = internalCluster().startNode();
+        createIndex(INDEX, 1, 0);
+        ensureGreen(INDEX);
+        final var failRelocation = new AtomicBoolean(true);
+        String node2 = internalCluster().startNode();
+        internalCluster().getInstance(MockIndexEventListener.TestEventListener.class, node2).setNewDelegate(new IndexEventListener() {
+            @Override
+            public void beforeIndexCreated(Index index, Settings indexSettings) {
+                if (failRelocation.get()) {
+                    throw new RuntimeException("FAIL");
+                }
+            }
+        });
+        updateIndexSettings(Settings.builder().put(INDEX_ROUTING_EXCLUDE_GROUP_PREFIX + "._name", node1), INDEX);
+        ensureGreen(INDEX);
+        // await all relocation attempts are exhausted
+        var maxAttempts = MaxRetryAllocationDecider.SETTING_ALLOCATION_MAX_RETRY.get(Settings.EMPTY);
+        assertBusy(() -> {
+            var state = safeGet(clusterAdmin().prepareState(TEST_REQUEST_TIMEOUT).execute()).getState();
+            var shard = state.routingTable().index(INDEX).shard(SHARD).primaryShard();
+            assertThat(shard, notNullValue());
+            assertThat(shard.relocationFailureInfo().failedRelocations(), equalTo(maxAttempts));
+        });
+        // ensure the shard remains started
+        var state = safeGet(clusterAdmin().prepareState(TEST_REQUEST_TIMEOUT).execute()).getState();
+        var shard = state.routingTable().index(INDEX).shard(SHARD).primaryShard();
+        assertThat(shard, notNullValue());
+        assertThat(shard.state(), equalTo(ShardRoutingState.STARTED));
+        assertThat(state.nodes().get(shard.currentNodeId()).getName(), equalTo(node1));
+        failRelocation.set(false);
+        // A new node joining should reset the counter and allow more relocation retries
+        try (var mockLog = MockLog.capture(RoutingNodes.class)) {
+            mockLog.addExpectation(
+                new MockLog.SeenEventExpectation(
+                    "log resetting failed relocations",
+                    RoutingNodes.class.getName(),
+                    Level.INFO,
+                    Strings.format(RoutingNodes.RESET_FAILED_RELOCATION_COUNTER_LOG_MSG, 1, List.of(shard.shardId()))
+                )
+            );
+            internalCluster().startNode();
+            assertBusy(() -> {
+                var stateAfterNodeJoin = internalCluster().clusterService().state();
+                var relocatedShard = stateAfterNodeJoin.routingTable().index(INDEX).shard(SHARD).primaryShard();
+                assertThat(relocatedShard, notNullValue());
+                assertThat(stateAfterNodeJoin.nodes().get(relocatedShard.currentNodeId()).getName(), not(equalTo(node1)));
+            });
+            mockLog.assertAllExpectationsMatched();
+        }
+    }
 }
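
In testResetRelocationFailuresOnNodeJoin above, relocation is forced by excluding node1 through an index-level allocation filter (INDEX_ROUTING_EXCLUDE_GROUP_PREFIX resolves to index.routing.allocation.exclude). A tiny sketch of what that setting looks like when built directly; the node name is a hypothetical placeholder:

    import org.elasticsearch.common.settings.Settings;

    public class ExcludeFilterSketch {
        public static void main(String[] args) {
            // Excluding a node by name forces the allocator to move the index's shards elsewhere,
            // which is what drives the relocation attempts (and failures) in the test.
            Settings exclude = Settings.builder()
                .put("index.routing.allocation.exclude._name", "node-1")
                .build();
            System.out.println(exclude.get("index.routing.allocation.exclude._name"));
        }
    }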

server/src/main/java/org/elasticsearch/cluster/routing/RoutingNodes.java

Lines changed: 75 additions & 3 deletions
@@ -16,14 +16,19 @@
 import org.elasticsearch.cluster.node.DiscoveryNodes;
 import org.elasticsearch.cluster.routing.UnassignedInfo.AllocationStatus;
 import org.elasticsearch.cluster.routing.allocation.ExistingShardsAllocator;
+import org.elasticsearch.cluster.routing.allocation.RoutingAllocation;
 import org.elasticsearch.cluster.routing.allocation.allocator.DesiredBalanceMetrics;
+import org.elasticsearch.common.Strings;
 import org.elasticsearch.common.collect.Iterators;
 import org.elasticsearch.common.util.Maps;
 import org.elasticsearch.core.Assertions;
 import org.elasticsearch.core.Nullable;
 import org.elasticsearch.core.Tuple;
 import org.elasticsearch.index.Index;
+import org.elasticsearch.index.IndexNotFoundException;
 import org.elasticsearch.index.shard.ShardId;
+import org.elasticsearch.logging.LogManager;
+import org.elasticsearch.logging.Logger;

 import java.util.ArrayDeque;
 import java.util.ArrayList;
@@ -44,6 +49,8 @@
 import java.util.stream.Stream;
 import java.util.stream.StreamSupport;

+import static org.elasticsearch.cluster.routing.allocation.decider.MaxRetryAllocationDecider.SETTING_ALLOCATION_MAX_RETRY;
+
 /**
  * {@link RoutingNodes} represents a copy the routing information contained in the {@link ClusterState cluster state}.
  * It can be either initialized as mutable or immutable allowing or disallowing changes to its elements.
@@ -60,6 +67,13 @@
  */
 public class RoutingNodes implements Iterable<RoutingNode> {

+    private static final Logger logger = LogManager.getLogger(RoutingNodes.class);
+    public static final String RESET_FAILED_ALLOCATION_COUNTER_LOG_MSG =
+        "Resetting failure counter for %d shard(s) that have reached their max allocation retires (%s)";
+    public static final String RESET_FAILED_RELOCATION_COUNTER_LOG_MSG =
+        "Resetting failure counter for %d shard(s) that have reached their max relocation retries (%s)";
+    private static final int MAX_SHARDS_IN_LOG_MSG = 20;
+
     private final Map<String, RoutingNode> nodesToShards;

     private final UnassignedShards unassignedShards;
@@ -1298,14 +1312,47 @@ public boolean hasAllocationFailures() {
         }));
     }

-    public void resetFailedCounter(RoutingChangesObserver routingChangesObserver) {
+    public boolean hasRelocationFailures() {
+        for (var shardRoutings : assignedShards.values()) {
+            for (var routing : shardRoutings) {
+                if (routing.relocationFailureInfo() != null && routing.relocationFailureInfo().failedRelocations() > 0) {
+                    return true;
+                }
+            }
+        }
+        return false;
+    }
+
+    public void resetFailedCounter(RoutingAllocation allocation) {
+        final var observer = allocation.changes();
+        int shardsWithMaxFailedAllocations = 0;
+        int shardsWithMaxFailedRelocations = 0;
+        List<ShardId> topShardIdsWithFailedAllocations = new ArrayList<>();
+        List<ShardId> topShardIdsWithFailedRelocations = new ArrayList<>();
+
         final var unassignedIterator = unassigned().iterator();
         while (unassignedIterator.hasNext()) {
             ShardRouting shardRouting = unassignedIterator.next();
             UnassignedInfo unassignedInfo = shardRouting.unassignedInfo();
+            int failedAllocations = unassignedInfo.failedAllocations();
+            if (failedAllocations > 0) {
+                try {
+                    final var maxRetry = SETTING_ALLOCATION_MAX_RETRY.get(
+                        allocation.metadata().getIndexSafe(shardRouting.index()).getSettings()
+                    );
+                    if (failedAllocations >= maxRetry) {
+                        shardsWithMaxFailedAllocations++;
+                        if (topShardIdsWithFailedAllocations.size() <= MAX_SHARDS_IN_LOG_MSG) {
+                            topShardIdsWithFailedAllocations.add(shardRouting.shardId());
+                        }
+                    }
+                } catch (IndexNotFoundException e) {
+                    // ignore
+                }
+            }
             unassignedIterator.updateUnassigned(
                 new UnassignedInfo(
-                    unassignedInfo.failedAllocations() > 0 ? UnassignedInfo.Reason.MANUAL_ALLOCATION : unassignedInfo.reason(),
+                    failedAllocations > 0 ? UnassignedInfo.Reason.MANUAL_ALLOCATION : unassignedInfo.reason(),
                     unassignedInfo.message(),
                     unassignedInfo.failure(),
                     0,
@@ -1317,7 +1364,7 @@ public void resetFailedCounter(RoutingChangesObserver routingChangesObserver) {
                     unassignedInfo.lastAllocatedNodeId()
                 ),
                 shardRouting.recoverySource(),
-                routingChangesObserver
+                observer
             );
         }

@@ -1326,6 +1373,20 @@ public void resetFailedCounter(RoutingChangesObserver routingChangesObserver) {
             for (ShardRouting shardRouting : routingNode) {
                 if (shardRouting.relocationFailureInfo() != null && shardRouting.relocationFailureInfo().failedRelocations() > 0) {
                     shardsWithRelocationFailures.add(shardRouting);
+                    try {
+                        int failedRelocations = shardRouting.relocationFailureInfo().failedRelocations();
+                        final var maxRetry = SETTING_ALLOCATION_MAX_RETRY.get(
+                            allocation.metadata().getIndexSafe(shardRouting.index()).getSettings()
+                        );
+                        if (failedRelocations >= maxRetry) {
+                            shardsWithMaxFailedRelocations++;
+                            if (topShardIdsWithFailedRelocations.size() <= MAX_SHARDS_IN_LOG_MSG) {
+                                topShardIdsWithFailedRelocations.add(shardRouting.shardId());
+                            }
+                        }
+                    } catch (IndexNotFoundException e) {
+                        // ignore
+                    }
                 }
             }

@@ -1336,6 +1397,17 @@ public void resetFailedCounter(RoutingChangesObserver routingChangesObserver) {
                 assignedShardsAdd(updated);
             }
         }
+
+        if (shardsWithMaxFailedAllocations > 0) {
+            logger.info(
+                Strings.format(RESET_FAILED_ALLOCATION_COUNTER_LOG_MSG, shardsWithMaxFailedAllocations, topShardIdsWithFailedAllocations)
+            );
+        }
+        if (shardsWithMaxFailedRelocations > 0) {
+            logger.info(
+                Strings.format(RESET_FAILED_RELOCATION_COUNTER_LOG_MSG, shardsWithMaxFailedRelocations, topShardIdsWithFailedRelocations)
+            );
+        }
     }

     /**
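
resetFailedCounter above counts every shard that has reached its retry limit but only keeps a bounded number of shard IDs (MAX_SHARDS_IN_LOG_MSG) for the INFO log line. A stripped-down sketch of that count-all, log-a-sample pattern in plain Java; the names and types are illustrative, not the production code:

    import java.util.ArrayList;
    import java.util.List;
    import java.util.Map;

    public class BoundedLogSampleSketch {
        private static final int MAX_IDS_IN_LOG_MSG = 20;

        // Count every shard at or above the retry limit, but cap how many IDs end up in the message.
        static String summarize(Map<String, Integer> failedAttemptsByShard, int maxRetries) {
            int total = 0;
            List<String> sample = new ArrayList<>();
            for (var entry : failedAttemptsByShard.entrySet()) {
                if (entry.getValue() >= maxRetries) {
                    total++;
                    if (sample.size() < MAX_IDS_IN_LOG_MSG) {
                        sample.add(entry.getKey());
                    }
                }
            }
            return String.format("Resetting failure counter for %d shard(s) (%s)", total, sample);
        }

        public static void main(String[] args) {
            System.out.println(summarize(Map.of("[idx][0]", 5, "[idx][1]", 2), 5));
        }
    }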

server/src/main/java/org/elasticsearch/cluster/routing/allocation/AllocationService.java

Lines changed: 60 additions & 2 deletions
@@ -12,13 +12,15 @@
 import org.apache.logging.log4j.LogManager;
 import org.apache.logging.log4j.Logger;
 import org.elasticsearch.action.ActionListener;
+import org.elasticsearch.cluster.ClusterChangedEvent;
 import org.elasticsearch.cluster.ClusterInfoService;
 import org.elasticsearch.cluster.ClusterState;
 import org.elasticsearch.cluster.RestoreInProgress;
 import org.elasticsearch.cluster.health.ClusterHealthStatus;
 import org.elasticsearch.cluster.metadata.AutoExpandReplicas;
 import org.elasticsearch.cluster.metadata.IndexMetadata;
 import org.elasticsearch.cluster.metadata.Metadata;
+import org.elasticsearch.cluster.metadata.SingleNodeShutdownMetadata;
 import org.elasticsearch.cluster.metadata.SingleNodeShutdownMetadata.Type;
 import org.elasticsearch.cluster.node.DiscoveryNode;
 import org.elasticsearch.cluster.routing.IndexRoutingTable;
@@ -573,15 +575,71 @@ public void addAllocFailuresResetListenerTo(ClusterService clusterService) {
         });

         clusterService.addListener((changeEvent) -> {
-            if (changeEvent.nodesAdded() && changeEvent.state().getRoutingNodes().hasAllocationFailures()) {
+            if (shouldResetAllocationFailures(changeEvent)) {
                 taskQueue.submitTask("reset-allocation-failures", (e) -> { assert MasterService.isPublishFailureException(e); }, null);
             }
         });
     }

+    /**
+     * We should reset allocation/relocation failure count to allow further retries when:
+     *
+     * 1. A new node joins the cluster.
+     * 2. A node shutdown metadata is added that could lead to a node being removed or replaced in the cluster.
+     *
+     * Note that removing a non-RESTART shutdown metadata from a node that is still in the cluster is treated similarly and
+     * will cause resetting the allocation/relocation failures.
+     */
+    private boolean shouldResetAllocationFailures(ClusterChangedEvent changeEvent) {
+        final var clusterState = changeEvent.state();
+
+        if (clusterState.getRoutingNodes().hasAllocationFailures() == false
+            && clusterState.getRoutingNodes().hasRelocationFailures() == false) {
+            return false;
+        }
+        if (changeEvent.nodesAdded()) {
+            return true;
+        }
+
+        final var currentNodeShutdowns = clusterState.metadata().nodeShutdowns();
+        final var previousNodeShutdowns = changeEvent.previousState().metadata().nodeShutdowns();
+
+        if (currentNodeShutdowns.equals(previousNodeShutdowns)) {
+            return false;
+        }
+
+        for (var currentShutdown : currentNodeShutdowns.getAll().entrySet()) {
+            var previousNodeShutdown = previousNodeShutdowns.get(currentShutdown.getKey());
+            if (currentShutdown.equals(previousNodeShutdown)) {
+                continue;
+            }
+            // A RESTART doesn't necessarily move around shards, so no need to consider it for a reset.
+            // Furthermore, once the node rejoins after restarting, there will be a reset if necessary.
+            if (currentShutdown.getValue().getType() == SingleNodeShutdownMetadata.Type.RESTART) {
+                continue;
+            }
+            // A node with no shutdown marker or a RESTART marker receives a non-RESTART shutdown marker
+            if (previousNodeShutdown == null || previousNodeShutdown.getType() == Type.RESTART) {
+                return true;
+            }
+        }
+
+        for (var previousShutdown : previousNodeShutdowns.getAll().entrySet()) {
+            var nodeId = previousShutdown.getKey();
+            // A non-RESTART marker is removed but the node is still in the cluster. We could re-attempt failed relocations/allocations.
+            if (currentNodeShutdowns.get(nodeId) == null
+                && previousShutdown.getValue().getType() != SingleNodeShutdownMetadata.Type.RESTART
+                && clusterState.nodes().get(nodeId) != null) {
+                return true;
+            }
+        }
+
+        return false;
+    }
+
     private ClusterState rerouteWithResetFailedCounter(ClusterState clusterState) {
         RoutingAllocation allocation = createRoutingAllocation(clusterState, currentNanoTime());
-        allocation.routingNodes().resetFailedCounter(allocation.changes());
+        allocation.routingNodes().resetFailedCounter(allocation);
         reroute(allocation, routingAllocation -> shardsAllocator.allocate(routingAllocation, ActionListener.noop()));
         return buildResultAndLogHealthChange(clusterState, allocation, "reroute with reset failed counter");
     }
server/src/main/java/org/elasticsearch/cluster/routing/allocation/allocator/ShardsAllocator.java

Lines changed: 1 addition & 1 deletion
@@ -62,7 +62,7 @@ default RoutingExplanations execute(RoutingAllocation allocation, AllocationComm

         try {
             if (retryFailed) {
-                allocation.routingNodes().resetFailedCounter(allocation.changes());
+                allocation.routingNodes().resetFailedCounter(allocation);
             }
             return commands.execute(allocation, explain);
         } finally {
