From affdae515391a07eb893c81ca55680a934fc1001 Mon Sep 17 00:00:00 2001 From: Nick Tindall Date: Thu, 23 Oct 2025 16:48:25 +1100 Subject: [PATCH 01/44] Log when no progress is made towards the desired balance for some time --- .../allocator/DesiredBalanceReconciler.java | 87 +++++++++++++++++-- .../common/settings/ClusterSettings.java | 1 + 2 files changed, 83 insertions(+), 5 deletions(-) diff --git a/server/src/main/java/org/elasticsearch/cluster/routing/allocation/allocator/DesiredBalanceReconciler.java b/server/src/main/java/org/elasticsearch/cluster/routing/allocation/allocator/DesiredBalanceReconciler.java index 38be34c0ff610..ab586c219ebb1 100644 --- a/server/src/main/java/org/elasticsearch/cluster/routing/allocation/allocator/DesiredBalanceReconciler.java +++ b/server/src/main/java/org/elasticsearch/cluster/routing/allocation/allocator/DesiredBalanceReconciler.java @@ -30,16 +30,18 @@ import org.elasticsearch.common.Strings; import org.elasticsearch.common.settings.ClusterSettings; import org.elasticsearch.common.settings.Setting; +import org.elasticsearch.common.time.TimeProvider; +import org.elasticsearch.common.util.set.Sets; import org.elasticsearch.core.TimeValue; import org.elasticsearch.core.Tuple; import org.elasticsearch.gateway.PriorityComparator; import org.elasticsearch.index.IndexVersions; import org.elasticsearch.index.shard.ShardId; -import org.elasticsearch.threadpool.ThreadPool; import java.util.Comparator; import java.util.Iterator; import java.util.Set; +import java.util.concurrent.atomic.AtomicLong; import java.util.function.BiFunction; import java.util.stream.Collectors; import java.util.stream.IntStream; @@ -54,6 +56,7 @@ public class DesiredBalanceReconciler { private static final Logger logger = LogManager.getLogger(DesiredBalanceReconciler.class); + private static final int IMMOVABLE_SHARDS_SAMPLE_SIZE = 3; /** * The minimum interval that log messages will be written if the number of undesired shard allocations reaches the percentage of total @@ -79,21 +82,45 @@ public class DesiredBalanceReconciler { Setting.Property.NodeScope ); + /** + * Warning logs will be periodically written if we haven't seen any progress towards balance in the time specified + */ + public static final Setting NO_PROGRESS_TOWARDS_BALANCE_LOG_THRESHOLD_SETTING = Setting.timeSetting( + "cluster.routing.allocation.desired_balance.no_progress_towards_balance_logging.threshold", + TimeValue.timeValueMinutes(5), + Setting.Property.Dynamic, + Setting.Property.NodeScope + ); + + private final TimeProvider timeProvider; + private final AtomicLong lastProgressTowardsBalanceTimestampMillis = new AtomicLong(0L); private final FrequencyCappedAction undesiredAllocationLogInterval; + private final FrequencyCappedAction noProgressTowardsBalanceLogInterval; private double undesiredAllocationsLogThreshold; private final NodeAllocationOrdering allocationOrdering = new NodeAllocationOrdering(); private final NodeAllocationOrdering moveOrdering = new NodeAllocationOrdering(); + private volatile TimeValue noProgressTowardsBalanceThreshold; - public DesiredBalanceReconciler(ClusterSettings clusterSettings, ThreadPool threadPool) { - this.undesiredAllocationLogInterval = new FrequencyCappedAction( - threadPool.relativeTimeInMillisSupplier(), + public DesiredBalanceReconciler(ClusterSettings clusterSettings, TimeProvider timeProvider) { + this.timeProvider = timeProvider; + this.undesiredAllocationLogInterval = new FrequencyCappedAction(timeProvider::relativeTimeInMillis, TimeValue.timeValueMinutes(5)); + this.noProgressTowardsBalanceLogInterval = new FrequencyCappedAction( + timeProvider::relativeTimeInMillis, TimeValue.timeValueMinutes(5) ); - clusterSettings.initializeAndWatch(UNDESIRED_ALLOCATIONS_LOG_INTERVAL_SETTING, this.undesiredAllocationLogInterval::setMinInterval); + this.lastProgressTowardsBalanceTimestampMillis.set(timeProvider.relativeTimeInMillis()); + clusterSettings.initializeAndWatch(UNDESIRED_ALLOCATIONS_LOG_INTERVAL_SETTING, value -> { + this.undesiredAllocationLogInterval.setMinInterval(value); + this.noProgressTowardsBalanceLogInterval.setMinInterval(value); + }); clusterSettings.initializeAndWatch( UNDESIRED_ALLOCATIONS_LOG_THRESHOLD_SETTING, value -> this.undesiredAllocationsLogThreshold = value ); + clusterSettings.initializeAndWatch( + NO_PROGRESS_TOWARDS_BALANCE_LOG_THRESHOLD_SETTING, + value -> this.noProgressTowardsBalanceThreshold = value + ); } /** @@ -532,6 +559,7 @@ private DesiredBalanceMetrics.AllocationStats balance() { int undesiredAllocationsExcludingShuttingDownNodes = 0; final ObjectLongMap totalAllocationsByRole = new ObjectLongHashMap<>(); final ObjectLongMap undesiredAllocationsExcludingShuttingDownNodesByRole = new ObjectLongHashMap<>(); + final Set undesiredShardSample = Sets.newHashSetWithExpectedSize(IMMOVABLE_SHARDS_SAMPLE_SIZE); // Iterate over all started shards and try to move any which are on undesired nodes. In the presence of throttling shard // movements, the goal of this iteration order is to achieve a fairer movement of shards from the nodes that are offloading the @@ -587,6 +615,7 @@ private DesiredBalanceMetrics.AllocationStats balance() { shardRouting.currentNodeId(), rebalanceTarget.getId() ); + lastProgressTowardsBalanceTimestampMillis.set(timeProvider.relativeTimeInMillis()); routingNodes.relocateShard( shardRouting, @@ -597,9 +626,14 @@ private DesiredBalanceMetrics.AllocationStats balance() { ); iterator.dePrioritizeNode(shardRouting.currentNodeId()); moveOrdering.recordAllocation(shardRouting.currentNodeId()); + } else { + if (undesiredShardSample.size() < IMMOVABLE_SHARDS_SAMPLE_SIZE) { + undesiredShardSample.add(shardRouting); + } } } + maybeLogProgressTowardsDesiredWarning(undesiredAllocationsExcludingShuttingDownNodes, undesiredShardSample); maybeLogUndesiredAllocationsWarning(totalAllocations, undesiredAllocationsExcludingShuttingDownNodes, routingNodes.size()); return new DesiredBalanceMetrics.AllocationStats( unassignedShards, @@ -616,6 +650,49 @@ private DesiredBalanceMetrics.AllocationStats balance() { ); } + /** + * If there are undesired allocations, and we have made no recent progress towards the desired balance, log a warning + * + * @param undesiredAllocationsExcludingShuttingDownNodes The number of undesired allocations (excluding shutting down nodes) + * @param immovableShardsSample A sample of the shards we found to be immovable + */ + private void maybeLogProgressTowardsDesiredWarning( + int undesiredAllocationsExcludingShuttingDownNodes, + Set immovableShardsSample + ) { + // There are no undesired allocations, reset the last-progress timestamp + if (undesiredAllocationsExcludingShuttingDownNodes == 0) { + lastProgressTowardsBalanceTimestampMillis.set(timeProvider.relativeTimeInMillis()); + return; + } + + final long millisecondsSinceLastProgress = timeProvider.relativeTimeInMillis() - lastProgressTowardsBalanceTimestampMillis + .get(); + if (millisecondsSinceLastProgress > noProgressTowardsBalanceThreshold.millis()) { + noProgressTowardsBalanceLogInterval.maybeExecute(() -> { + TimeValue timeSinceProgress = TimeValue.timeValueMillis(millisecondsSinceLastProgress); + logger.warn( + "No progress has been made towards desired balance for [{}], this exceeds the warn threshold of [{}]", + timeSinceProgress, + noProgressTowardsBalanceThreshold + ); + final RoutingAllocation.DebugMode originalDebugMode = allocation.getDebugMode(); + allocation.setDebugMode(RoutingAllocation.DebugMode.EXCLUDE_YES_DECISIONS); + try { + for (final var shardRouting : immovableShardsSample) { + final var assignment = desiredBalance.getAssignment(shardRouting.shardId()); + for (final var nodeId : assignment.nodeIds()) { + final var decision = allocation.deciders().canAllocate(shardRouting, routingNodes.node(nodeId), allocation); + logger.warn("Shard [{}] cannot be allocated on node [{}]: {}", shardRouting.shardId(), nodeId, decision); + } + } + } finally { + allocation.setDebugMode(originalDebugMode); + } + }); + } + } + private void maybeLogUndesiredAllocationsWarning(int totalAllocations, int undesiredAllocations, int nodeCount) { // more shards than cluster can relocate with one reroute final boolean nonEmptyRelocationBacklog = undesiredAllocations > 2L * nodeCount; diff --git a/server/src/main/java/org/elasticsearch/common/settings/ClusterSettings.java b/server/src/main/java/org/elasticsearch/common/settings/ClusterSettings.java index 595149ff26bb5..711b68bd3abe1 100644 --- a/server/src/main/java/org/elasticsearch/common/settings/ClusterSettings.java +++ b/server/src/main/java/org/elasticsearch/common/settings/ClusterSettings.java @@ -235,6 +235,7 @@ public void apply(Settings value, Settings current, Settings previous) { DesiredBalanceComputer.MAX_BALANCE_COMPUTATION_TIME_DURING_INDEX_CREATION_SETTING, DesiredBalanceReconciler.UNDESIRED_ALLOCATIONS_LOG_INTERVAL_SETTING, DesiredBalanceReconciler.UNDESIRED_ALLOCATIONS_LOG_THRESHOLD_SETTING, + DesiredBalanceReconciler.NO_PROGRESS_TOWARDS_BALANCE_LOG_THRESHOLD_SETTING, BreakerSettings.CIRCUIT_BREAKER_LIMIT_SETTING, BreakerSettings.CIRCUIT_BREAKER_OVERHEAD_SETTING, BreakerSettings.CIRCUIT_BREAKER_TYPE, From 5249be9b7fccf08592f4537cd83732811c873f00 Mon Sep 17 00:00:00 2001 From: Nick Tindall Date: Fri, 24 Oct 2025 11:49:08 +1100 Subject: [PATCH 02/44] Track immovable shards individually --- .../allocator/DesiredBalanceReconciler.java | 127 ++++++++++-------- .../common/settings/ClusterSettings.java | 2 +- 2 files changed, 70 insertions(+), 59 deletions(-) diff --git a/server/src/main/java/org/elasticsearch/cluster/routing/allocation/allocator/DesiredBalanceReconciler.java b/server/src/main/java/org/elasticsearch/cluster/routing/allocation/allocator/DesiredBalanceReconciler.java index ab586c219ebb1..8669e767a058a 100644 --- a/server/src/main/java/org/elasticsearch/cluster/routing/allocation/allocator/DesiredBalanceReconciler.java +++ b/server/src/main/java/org/elasticsearch/cluster/routing/allocation/allocator/DesiredBalanceReconciler.java @@ -11,6 +11,8 @@ import com.carrotsearch.hppc.ObjectLongHashMap; import com.carrotsearch.hppc.ObjectLongMap; +import com.carrotsearch.hppc.procedures.LongProcedure; +import com.carrotsearch.hppc.procedures.ObjectLongProcedure; import org.apache.logging.log4j.LogManager; import org.apache.logging.log4j.Logger; @@ -31,7 +33,7 @@ import org.elasticsearch.common.settings.ClusterSettings; import org.elasticsearch.common.settings.Setting; import org.elasticsearch.common.time.TimeProvider; -import org.elasticsearch.common.util.set.Sets; +import org.elasticsearch.core.Nullable; import org.elasticsearch.core.TimeValue; import org.elasticsearch.core.Tuple; import org.elasticsearch.gateway.PriorityComparator; @@ -56,7 +58,6 @@ public class DesiredBalanceReconciler { private static final Logger logger = LogManager.getLogger(DesiredBalanceReconciler.class); - private static final int IMMOVABLE_SHARDS_SAMPLE_SIZE = 3; /** * The minimum interval that log messages will be written if the number of undesired shard allocations reaches the percentage of total @@ -83,10 +84,10 @@ public class DesiredBalanceReconciler { ); /** - * Warning logs will be periodically written if we haven't seen any progress towards balance in the time specified + * Warning logs will be periodically written if we see a shard that's been unable to be allocated for this long */ - public static final Setting NO_PROGRESS_TOWARDS_BALANCE_LOG_THRESHOLD_SETTING = Setting.timeSetting( - "cluster.routing.allocation.desired_balance.no_progress_towards_balance_logging.threshold", + public static final Setting IMMOVABLE_SHARD_LOG_THRESHOLD_SETTING = Setting.timeSetting( + "cluster.routing.allocation.desired_balance.immovable_shard_logging.threshold", TimeValue.timeValueMinutes(5), Setting.Property.Dynamic, Setting.Property.NodeScope @@ -99,7 +100,8 @@ public class DesiredBalanceReconciler { private double undesiredAllocationsLogThreshold; private final NodeAllocationOrdering allocationOrdering = new NodeAllocationOrdering(); private final NodeAllocationOrdering moveOrdering = new NodeAllocationOrdering(); - private volatile TimeValue noProgressTowardsBalanceThreshold; + private volatile TimeValue immovableShardThreshold; + private final ObjectLongHashMap immovableShards = new ObjectLongHashMap<>(); public DesiredBalanceReconciler(ClusterSettings clusterSettings, TimeProvider timeProvider) { this.timeProvider = timeProvider; @@ -117,10 +119,7 @@ public DesiredBalanceReconciler(ClusterSettings clusterSettings, TimeProvider ti UNDESIRED_ALLOCATIONS_LOG_THRESHOLD_SETTING, value -> this.undesiredAllocationsLogThreshold = value ); - clusterSettings.initializeAndWatch( - NO_PROGRESS_TOWARDS_BALANCE_LOG_THRESHOLD_SETTING, - value -> this.noProgressTowardsBalanceThreshold = value - ); + clusterSettings.initializeAndWatch(IMMOVABLE_SHARD_LOG_THRESHOLD_SETTING, value -> this.immovableShardThreshold = value); } /** @@ -141,6 +140,7 @@ public DesiredBalanceMetrics.AllocationStats reconcile(DesiredBalance desiredBal public void clear() { allocationOrdering.clear(); moveOrdering.clear(); + immovableShards.clear(); } /** @@ -559,7 +559,6 @@ private DesiredBalanceMetrics.AllocationStats balance() { int undesiredAllocationsExcludingShuttingDownNodes = 0; final ObjectLongMap totalAllocationsByRole = new ObjectLongHashMap<>(); final ObjectLongMap undesiredAllocationsExcludingShuttingDownNodesByRole = new ObjectLongHashMap<>(); - final Set undesiredShardSample = Sets.newHashSetWithExpectedSize(IMMOVABLE_SHARDS_SAMPLE_SIZE); // Iterate over all started shards and try to move any which are on undesired nodes. In the presence of throttling shard // movements, the goal of this iteration order is to achieve a fairer movement of shards from the nodes that are offloading the @@ -607,19 +606,21 @@ private DesiredBalanceMetrics.AllocationStats balance() { continue; } - final var rebalanceTarget = findRelocationTarget(shardRouting, assignment.nodeIds(), this::decideCanAllocate); - if (rebalanceTarget != null) { + final var rebalanceDecision = findRelocationTarget(shardRouting, assignment.nodeIds(), this::decideCanAllocate); + final var rebalanceTargetNode = rebalanceDecision.chosenNode(); + if (rebalanceTargetNode != null) { + immovableShards.remove(shardRouting); logger.debug( "Rebalancing shard {} from {} to {}", shardRouting.shardId(), shardRouting.currentNodeId(), - rebalanceTarget.getId() + rebalanceTargetNode.getId() ); lastProgressTowardsBalanceTimestampMillis.set(timeProvider.relativeTimeInMillis()); routingNodes.relocateShard( shardRouting, - rebalanceTarget.getId(), + rebalanceTargetNode.getId(), allocation.clusterInfo().getShardSize(shardRouting, ShardRouting.UNAVAILABLE_EXPECTED_SHARD_SIZE), "rebalance", allocation.changes() @@ -627,13 +628,16 @@ private DesiredBalanceMetrics.AllocationStats balance() { iterator.dePrioritizeNode(shardRouting.currentNodeId()); moveOrdering.recordAllocation(shardRouting.currentNodeId()); } else { - if (undesiredShardSample.size() < IMMOVABLE_SHARDS_SAMPLE_SIZE) { - undesiredShardSample.add(shardRouting); + // Start tracking this shard as immovable if we are not already + if (rebalanceDecision.bestDecision() == null || rebalanceDecision.bestDecision() == Decision.NO) { + if (immovableShards.containsKey(shardRouting) == false) { + immovableShards.put(shardRouting, timeProvider.relativeTimeInMillis()); + } } } } - maybeLogProgressTowardsDesiredWarning(undesiredAllocationsExcludingShuttingDownNodes, undesiredShardSample); + maybeLogImmovableShardsWarning(); maybeLogUndesiredAllocationsWarning(totalAllocations, undesiredAllocationsExcludingShuttingDownNodes, routingNodes.size()); return new DesiredBalanceMetrics.AllocationStats( unassignedShards, @@ -651,48 +655,50 @@ private DesiredBalanceMetrics.AllocationStats balance() { } /** - * If there are undesired allocations, and we have made no recent progress towards the desired balance, log a warning - * - * @param undesiredAllocationsExcludingShuttingDownNodes The number of undesired allocations (excluding shutting down nodes) - * @param immovableShardsSample A sample of the shards we found to be immovable + * If there are shards that have been immovable for a long time, log a warning */ - private void maybeLogProgressTowardsDesiredWarning( - int undesiredAllocationsExcludingShuttingDownNodes, - Set immovableShardsSample - ) { - // There are no undesired allocations, reset the last-progress timestamp - if (undesiredAllocationsExcludingShuttingDownNodes == 0) { - lastProgressTowardsBalanceTimestampMillis.set(timeProvider.relativeTimeInMillis()); - return; + private void maybeLogImmovableShardsWarning() { + final long currentTimeMillis = timeProvider.relativeTimeInMillis(); + if (currentTimeMillis - oldestImmovableShardTimestamp() > immovableShardThreshold.millis()) { + noProgressTowardsBalanceLogInterval.maybeExecute(this::logDecisionsForImmovableShardsOverThreshold); } + } - final long millisecondsSinceLastProgress = timeProvider.relativeTimeInMillis() - lastProgressTowardsBalanceTimestampMillis - .get(); - if (millisecondsSinceLastProgress > noProgressTowardsBalanceThreshold.millis()) { - noProgressTowardsBalanceLogInterval.maybeExecute(() -> { - TimeValue timeSinceProgress = TimeValue.timeValueMillis(millisecondsSinceLastProgress); - logger.warn( - "No progress has been made towards desired balance for [{}], this exceeds the warn threshold of [{}]", - timeSinceProgress, - noProgressTowardsBalanceThreshold - ); - final RoutingAllocation.DebugMode originalDebugMode = allocation.getDebugMode(); - allocation.setDebugMode(RoutingAllocation.DebugMode.EXCLUDE_YES_DECISIONS); - try { - for (final var shardRouting : immovableShardsSample) { - final var assignment = desiredBalance.getAssignment(shardRouting.shardId()); - for (final var nodeId : assignment.nodeIds()) { - final var decision = allocation.deciders().canAllocate(shardRouting, routingNodes.node(nodeId), allocation); - logger.warn("Shard [{}] cannot be allocated on node [{}]: {}", shardRouting.shardId(), nodeId, decision); - } - } - } finally { - allocation.setDebugMode(originalDebugMode); - } - }); + private void logDecisionsForImmovableShardsOverThreshold() { + final long currentTimeMillis = timeProvider.relativeTimeInMillis(); + immovableShards.forEach((ObjectLongProcedure) (shardRouting, immovableSinceMillis) -> { + final long immovableDurationMs = currentTimeMillis - immovableSinceMillis; + if (immovableDurationMs > immovableShardThreshold.millis()) { + logImmovableShardDetails(shardRouting, TimeValue.timeValueMillis(immovableDurationMs)); + } + }); + } + + private void logImmovableShardDetails(ShardRouting shardRouting, TimeValue immovableDuration) { + final RoutingAllocation.DebugMode originalDebugMode = allocation.getDebugMode(); + allocation.setDebugMode(RoutingAllocation.DebugMode.EXCLUDE_YES_DECISIONS); + try { + final var assignment = desiredBalance.getAssignment(shardRouting.shardId()); + logger.warn("Shard {} has been immovable for {}", shardRouting.shardId(), immovableDuration); + for (final var nodeId : assignment.nodeIds()) { + final var decision = allocation.deciders().canAllocate(shardRouting, routingNodes.node(nodeId), allocation); + logger.warn("Shard [{}] cannot be allocated on node [{}]: {}", shardRouting.shardId(), nodeId, decision); + } + } finally { + allocation.setDebugMode(originalDebugMode); } } + private long oldestImmovableShardTimestamp() { + long[] oldestTimestamp = { Long.MAX_VALUE }; + immovableShards.values().forEach((LongProcedure) lc -> { + if (lc < oldestTimestamp[0]) { + oldestTimestamp[0] = lc; + } + }); + return oldestTimestamp[0]; + } + private void maybeLogUndesiredAllocationsWarning(int totalAllocations, int undesiredAllocations, int nodeCount) { // more shards than cluster can relocate with one reroute final boolean nonEmptyRelocationBacklog = undesiredAllocations > 2L * nodeCount; @@ -711,24 +717,25 @@ private void maybeLogUndesiredAllocationsWarning(int totalAllocations, int undes } private DiscoveryNode findRelocationTarget(final ShardRouting shardRouting, Set desiredNodeIds) { - final var moveDecision = findRelocationTarget(shardRouting, desiredNodeIds, this::decideCanAllocate); + final var moveDecision = findRelocationTarget(shardRouting, desiredNodeIds, this::decideCanAllocate).chosenNode(); if (moveDecision != null) { return moveDecision; } final var shardsOnReplacedNode = allocation.metadata().nodeShutdowns().contains(shardRouting.currentNodeId(), REPLACE); if (shardsOnReplacedNode) { - return findRelocationTarget(shardRouting, desiredNodeIds, this::decideCanForceAllocateForVacate); + return findRelocationTarget(shardRouting, desiredNodeIds, this::decideCanForceAllocateForVacate).chosenNode(); } return null; } - private DiscoveryNode findRelocationTarget( + private DecisionAndResult findRelocationTarget( ShardRouting shardRouting, Set desiredNodeIds, BiFunction canAllocateDecider ) { DiscoveryNode chosenNode = null; + Decision bestDecision = null; for (final var nodeId : desiredNodeIds) { // TODO consider ignored nodes here too? if (nodeId.equals(shardRouting.currentNodeId())) { @@ -746,6 +753,7 @@ private DiscoveryNode findRelocationTarget( // better to offload shards first. if (decision.type() == Decision.Type.YES) { chosenNode = node.node(); + bestDecision = decision; // As soon as we get any YES, we return it. break; } else if (decision.type() == Decision.Type.NOT_PREFERRED && chosenNode == null) { @@ -754,12 +762,15 @@ private DiscoveryNode findRelocationTarget( // choose and the shard cannot remain where it is, we accept not-preferred. NOT_PREFERRED is essentially a YES for // reconciliation. chosenNode = node.node(); + bestDecision = decision; } } - return chosenNode; + return new DecisionAndResult(bestDecision, chosenNode); } + private record DecisionAndResult(@Nullable Decision bestDecision, @Nullable DiscoveryNode chosenNode) {} + private Decision decideCanAllocate(ShardRouting shardRouting, RoutingNode target) { assert target != null : "Target node is not found"; return allocation.deciders().canAllocate(shardRouting, target, allocation); diff --git a/server/src/main/java/org/elasticsearch/common/settings/ClusterSettings.java b/server/src/main/java/org/elasticsearch/common/settings/ClusterSettings.java index 711b68bd3abe1..72f427fea6f22 100644 --- a/server/src/main/java/org/elasticsearch/common/settings/ClusterSettings.java +++ b/server/src/main/java/org/elasticsearch/common/settings/ClusterSettings.java @@ -235,7 +235,7 @@ public void apply(Settings value, Settings current, Settings previous) { DesiredBalanceComputer.MAX_BALANCE_COMPUTATION_TIME_DURING_INDEX_CREATION_SETTING, DesiredBalanceReconciler.UNDESIRED_ALLOCATIONS_LOG_INTERVAL_SETTING, DesiredBalanceReconciler.UNDESIRED_ALLOCATIONS_LOG_THRESHOLD_SETTING, - DesiredBalanceReconciler.NO_PROGRESS_TOWARDS_BALANCE_LOG_THRESHOLD_SETTING, + DesiredBalanceReconciler.IMMOVABLE_SHARD_LOG_THRESHOLD_SETTING, BreakerSettings.CIRCUIT_BREAKER_LIMIT_SETTING, BreakerSettings.CIRCUIT_BREAKER_OVERHEAD_SETTING, BreakerSettings.CIRCUIT_BREAKER_TYPE, From 14772abd5d41772f6f97fdcdee5fdcd9d35e4b60 Mon Sep 17 00:00:00 2001 From: Nick Tindall Date: Fri, 24 Oct 2025 11:54:26 +1100 Subject: [PATCH 03/44] Naming, initial delay --- .../allocator/DesiredBalanceReconciler.java | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) diff --git a/server/src/main/java/org/elasticsearch/cluster/routing/allocation/allocator/DesiredBalanceReconciler.java b/server/src/main/java/org/elasticsearch/cluster/routing/allocation/allocator/DesiredBalanceReconciler.java index 8669e767a058a..8dfd8bb8650a3 100644 --- a/server/src/main/java/org/elasticsearch/cluster/routing/allocation/allocator/DesiredBalanceReconciler.java +++ b/server/src/main/java/org/elasticsearch/cluster/routing/allocation/allocator/DesiredBalanceReconciler.java @@ -96,7 +96,7 @@ public class DesiredBalanceReconciler { private final TimeProvider timeProvider; private final AtomicLong lastProgressTowardsBalanceTimestampMillis = new AtomicLong(0L); private final FrequencyCappedAction undesiredAllocationLogInterval; - private final FrequencyCappedAction noProgressTowardsBalanceLogInterval; + private final FrequencyCappedAction immovableShardsLogInterval; private double undesiredAllocationsLogThreshold; private final NodeAllocationOrdering allocationOrdering = new NodeAllocationOrdering(); private final NodeAllocationOrdering moveOrdering = new NodeAllocationOrdering(); @@ -106,14 +106,11 @@ public class DesiredBalanceReconciler { public DesiredBalanceReconciler(ClusterSettings clusterSettings, TimeProvider timeProvider) { this.timeProvider = timeProvider; this.undesiredAllocationLogInterval = new FrequencyCappedAction(timeProvider::relativeTimeInMillis, TimeValue.timeValueMinutes(5)); - this.noProgressTowardsBalanceLogInterval = new FrequencyCappedAction( - timeProvider::relativeTimeInMillis, - TimeValue.timeValueMinutes(5) - ); + this.immovableShardsLogInterval = new FrequencyCappedAction(timeProvider::relativeTimeInMillis, TimeValue.ZERO); this.lastProgressTowardsBalanceTimestampMillis.set(timeProvider.relativeTimeInMillis()); clusterSettings.initializeAndWatch(UNDESIRED_ALLOCATIONS_LOG_INTERVAL_SETTING, value -> { this.undesiredAllocationLogInterval.setMinInterval(value); - this.noProgressTowardsBalanceLogInterval.setMinInterval(value); + this.immovableShardsLogInterval.setMinInterval(value); }); clusterSettings.initializeAndWatch( UNDESIRED_ALLOCATIONS_LOG_THRESHOLD_SETTING, @@ -660,7 +657,7 @@ private DesiredBalanceMetrics.AllocationStats balance() { private void maybeLogImmovableShardsWarning() { final long currentTimeMillis = timeProvider.relativeTimeInMillis(); if (currentTimeMillis - oldestImmovableShardTimestamp() > immovableShardThreshold.millis()) { - noProgressTowardsBalanceLogInterval.maybeExecute(this::logDecisionsForImmovableShardsOverThreshold); + immovableShardsLogInterval.maybeExecute(this::logDecisionsForImmovableShardsOverThreshold); } } From d8186d9c0b5af24e549643a5e53ff0eff885b42f Mon Sep 17 00:00:00 2001 From: Nick Tindall Date: Fri, 24 Oct 2025 11:55:23 +1100 Subject: [PATCH 04/44] Remove dead code --- .../allocation/allocator/DesiredBalanceReconciler.java | 4 ---- 1 file changed, 4 deletions(-) diff --git a/server/src/main/java/org/elasticsearch/cluster/routing/allocation/allocator/DesiredBalanceReconciler.java b/server/src/main/java/org/elasticsearch/cluster/routing/allocation/allocator/DesiredBalanceReconciler.java index 8dfd8bb8650a3..4b6eeb5be457c 100644 --- a/server/src/main/java/org/elasticsearch/cluster/routing/allocation/allocator/DesiredBalanceReconciler.java +++ b/server/src/main/java/org/elasticsearch/cluster/routing/allocation/allocator/DesiredBalanceReconciler.java @@ -43,7 +43,6 @@ import java.util.Comparator; import java.util.Iterator; import java.util.Set; -import java.util.concurrent.atomic.AtomicLong; import java.util.function.BiFunction; import java.util.stream.Collectors; import java.util.stream.IntStream; @@ -94,7 +93,6 @@ public class DesiredBalanceReconciler { ); private final TimeProvider timeProvider; - private final AtomicLong lastProgressTowardsBalanceTimestampMillis = new AtomicLong(0L); private final FrequencyCappedAction undesiredAllocationLogInterval; private final FrequencyCappedAction immovableShardsLogInterval; private double undesiredAllocationsLogThreshold; @@ -107,7 +105,6 @@ public DesiredBalanceReconciler(ClusterSettings clusterSettings, TimeProvider ti this.timeProvider = timeProvider; this.undesiredAllocationLogInterval = new FrequencyCappedAction(timeProvider::relativeTimeInMillis, TimeValue.timeValueMinutes(5)); this.immovableShardsLogInterval = new FrequencyCappedAction(timeProvider::relativeTimeInMillis, TimeValue.ZERO); - this.lastProgressTowardsBalanceTimestampMillis.set(timeProvider.relativeTimeInMillis()); clusterSettings.initializeAndWatch(UNDESIRED_ALLOCATIONS_LOG_INTERVAL_SETTING, value -> { this.undesiredAllocationLogInterval.setMinInterval(value); this.immovableShardsLogInterval.setMinInterval(value); @@ -613,7 +610,6 @@ private DesiredBalanceMetrics.AllocationStats balance() { shardRouting.currentNodeId(), rebalanceTargetNode.getId() ); - lastProgressTowardsBalanceTimestampMillis.set(timeProvider.relativeTimeInMillis()); routingNodes.relocateShard( shardRouting, From ad5c02877cd371c45c4d4aabf680770f1f21fdf3 Mon Sep 17 00:00:00 2001 From: Nick Tindall Date: Fri, 24 Oct 2025 11:59:00 +1100 Subject: [PATCH 05/44] Tidy --- .../allocation/allocator/DesiredBalanceReconciler.java | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/server/src/main/java/org/elasticsearch/cluster/routing/allocation/allocator/DesiredBalanceReconciler.java b/server/src/main/java/org/elasticsearch/cluster/routing/allocation/allocator/DesiredBalanceReconciler.java index 4b6eeb5be457c..44bb9d4460b61 100644 --- a/server/src/main/java/org/elasticsearch/cluster/routing/allocation/allocator/DesiredBalanceReconciler.java +++ b/server/src/main/java/org/elasticsearch/cluster/routing/allocation/allocator/DesiredBalanceReconciler.java @@ -601,19 +601,19 @@ private DesiredBalanceMetrics.AllocationStats balance() { } final var rebalanceDecision = findRelocationTarget(shardRouting, assignment.nodeIds(), this::decideCanAllocate); - final var rebalanceTargetNode = rebalanceDecision.chosenNode(); - if (rebalanceTargetNode != null) { + final var rebalanceTarget = rebalanceDecision.chosenNode(); + if (rebalanceTarget != null) { immovableShards.remove(shardRouting); logger.debug( "Rebalancing shard {} from {} to {}", shardRouting.shardId(), shardRouting.currentNodeId(), - rebalanceTargetNode.getId() + rebalanceTarget.getId() ); routingNodes.relocateShard( shardRouting, - rebalanceTargetNode.getId(), + rebalanceTarget.getId(), allocation.clusterInfo().getShardSize(shardRouting, ShardRouting.UNAVAILABLE_EXPECTED_SHARD_SIZE), "rebalance", allocation.changes() @@ -621,7 +621,7 @@ private DesiredBalanceMetrics.AllocationStats balance() { iterator.dePrioritizeNode(shardRouting.currentNodeId()); moveOrdering.recordAllocation(shardRouting.currentNodeId()); } else { - // Start tracking this shard as immovable if we are not already + // Start tracking this shard as immovable if we are not already, we're not interested in shards that are THROTTLED if (rebalanceDecision.bestDecision() == null || rebalanceDecision.bestDecision() == Decision.NO) { if (immovableShards.containsKey(shardRouting) == false) { immovableShards.put(shardRouting, timeProvider.relativeTimeInMillis()); From 7a98137de70fa47a839359431125ca30680f970e Mon Sep 17 00:00:00 2001 From: Nick Tindall Date: Fri, 24 Oct 2025 12:01:43 +1100 Subject: [PATCH 06/44] Javadoc --- .../allocation/allocator/DesiredBalanceReconciler.java | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/server/src/main/java/org/elasticsearch/cluster/routing/allocation/allocator/DesiredBalanceReconciler.java b/server/src/main/java/org/elasticsearch/cluster/routing/allocation/allocator/DesiredBalanceReconciler.java index 44bb9d4460b61..ba226c70f2eb8 100644 --- a/server/src/main/java/org/elasticsearch/cluster/routing/allocation/allocator/DesiredBalanceReconciler.java +++ b/server/src/main/java/org/elasticsearch/cluster/routing/allocation/allocator/DesiredBalanceReconciler.java @@ -762,6 +762,12 @@ private DecisionAndResult findRelocationTarget( return new DecisionAndResult(bestDecision, chosenNode); } + /** + * An allocation decision result + * + * @param bestDecision The best decision we saw from the nodes attempted (can be null if no nodes were attempted) + * @param chosenNode The node to allocate the shard to (can be null if no suitable nodes were found) + */ private record DecisionAndResult(@Nullable Decision bestDecision, @Nullable DiscoveryNode chosenNode) {} private Decision decideCanAllocate(ShardRouting shardRouting, RoutingNode target) { From e84efa6911cb558429ee9026bc16fbc1dc025562 Mon Sep 17 00:00:00 2001 From: Nick Tindall Date: Fri, 24 Oct 2025 12:05:05 +1100 Subject: [PATCH 07/44] Clear immovable shard in moveShards --- .../routing/allocation/allocator/DesiredBalanceReconciler.java | 1 + 1 file changed, 1 insertion(+) diff --git a/server/src/main/java/org/elasticsearch/cluster/routing/allocation/allocator/DesiredBalanceReconciler.java b/server/src/main/java/org/elasticsearch/cluster/routing/allocation/allocator/DesiredBalanceReconciler.java index ba226c70f2eb8..19661efef2f0a 100644 --- a/server/src/main/java/org/elasticsearch/cluster/routing/allocation/allocator/DesiredBalanceReconciler.java +++ b/server/src/main/java/org/elasticsearch/cluster/routing/allocation/allocator/DesiredBalanceReconciler.java @@ -534,6 +534,7 @@ private void moveShards() { final var moveTarget = findRelocationTarget(shardRouting, assignment.nodeIds()); if (moveTarget != null) { logger.debug("Moving shard {} from {} to {}", shardRouting.shardId(), shardRouting.currentNodeId(), moveTarget.getId()); + immovableShards.remove(shardRouting); routingNodes.relocateShard( shardRouting, moveTarget.getId(), From 4194739c83bdea2eb4629b5dcad81184dd5a03bd Mon Sep 17 00:00:00 2001 From: Nick Tindall Date: Mon, 27 Oct 2025 16:04:55 +1100 Subject: [PATCH 08/44] Try tracking undesired state in shard routing --- .../cluster/routing/RoutingNodes.java | 19 ++++ .../cluster/routing/ShardRouting.java | 94 ++++++++++++++++--- .../allocator/DesiredBalanceReconciler.java | 87 ++++++----------- .../routing/IndexRoutingTableTests.java | 3 +- .../cluster/routing/ShardRoutingTests.java | 30 ++++-- .../cluster/routing/ShardRoutingHelper.java | 6 +- .../cluster/routing/TestShardRouting.java | 18 +++- 7 files changed, 170 insertions(+), 87 deletions(-) diff --git a/server/src/main/java/org/elasticsearch/cluster/routing/RoutingNodes.java b/server/src/main/java/org/elasticsearch/cluster/routing/RoutingNodes.java index 12e10fc5ae04e..61b4004d298c3 100644 --- a/server/src/main/java/org/elasticsearch/cluster/routing/RoutingNodes.java +++ b/server/src/main/java/org/elasticsearch/cluster/routing/RoutingNodes.java @@ -892,6 +892,25 @@ public Map> getAssignedShards() { return Collections.unmodifiableMap(assignedShards); } + /** + * Record that this shard is in an undesired location if it's not already marked as such + */ + public ShardRouting markAsUndesired(ShardRouting originalShard, long becameUndesiredTime) { + final var withUndesiredInfo = originalShard.updateUndesired(becameUndesiredTime); + if (withUndesiredInfo != originalShard) { + nodesToShards.get(originalShard.currentNodeId()).update(originalShard, withUndesiredInfo); + return withUndesiredInfo; + } + return originalShard; + } + + public void clearUndesired(ShardRouting originalShard) { + final var withoutUndesiredInfo = originalShard.clearUndesired(); + if (withoutUndesiredInfo != originalShard) { + nodesToShards.get(originalShard.currentNodeId()).update(originalShard, withoutUndesiredInfo); + } + } + @Override public boolean equals(Object o) { if (this == o) { diff --git a/server/src/main/java/org/elasticsearch/cluster/routing/ShardRouting.java b/server/src/main/java/org/elasticsearch/cluster/routing/ShardRouting.java index 2ce349e2d3b61..40ff3157d91d0 100644 --- a/server/src/main/java/org/elasticsearch/cluster/routing/ShardRouting.java +++ b/server/src/main/java/org/elasticsearch/cluster/routing/ShardRouting.java @@ -42,6 +42,7 @@ public final class ShardRouting implements Writeable, ToXContentObject { * Used if shard size is not available */ public static final long UNAVAILABLE_EXPECTED_SHARD_SIZE = -1; + public static final long NOT_UNDESIRED_TIMESTAMP = Long.MAX_VALUE; private static final TransportVersion EXPECTED_SHARD_SIZE_FOR_STARTED_VERSION = TransportVersions.V_8_5_0; private static final TransportVersion RELOCATION_FAILURE_INFO_VERSION = TransportVersions.V_8_6_0; @@ -67,6 +68,7 @@ public final class ShardRouting implements Writeable, ToXContentObject { @Nullable private final ShardRouting targetRelocatingShard; private final Role role; + private final long becameUndesiredTime; /** * A constructor to internally create shard routing instances, note, the internal flag should only be set to true @@ -83,7 +85,8 @@ public final class ShardRouting implements Writeable, ToXContentObject { RelocationFailureInfo relocationFailureInfo, AllocationId allocationId, long expectedShardSize, - Role role + Role role, + long becameUndesiredTime ) { this.shardId = shardId; this.currentNodeId = currentNodeId; @@ -97,6 +100,7 @@ public final class ShardRouting implements Writeable, ToXContentObject { this.expectedShardSize = expectedShardSize; this.role = role; this.targetRelocatingShard = initializeTargetRelocatingShard(); + this.becameUndesiredTime = becameUndesiredTime; assert assertConsistent(); } @@ -150,7 +154,8 @@ private ShardRouting initializeTargetRelocatingShard() { RelocationFailureInfo.NO_FAILURES, AllocationId.newTargetRelocation(allocationId), expectedShardSize, - role + role, + NOT_UNDESIRED_TIMESTAMP // Assume we didn't relocate to another undesired allocation (?) ); } else { return null; @@ -178,7 +183,8 @@ public static ShardRouting newUnassigned( RelocationFailureInfo.NO_FAILURES, null, UNAVAILABLE_EXPECTED_SHARD_SIZE, - role + role, + NOT_UNDESIRED_TIMESTAMP ); } @@ -362,6 +368,7 @@ public ShardRouting(ShardId shardId, StreamInput in) throws IOException { role = Role.DEFAULT; } targetRelocatingShard = initializeTargetRelocatingShard(); + becameUndesiredTime = NOT_UNDESIRED_TIMESTAMP; // TODO: serialize } public ShardRouting(StreamInput in) throws IOException { @@ -422,10 +429,59 @@ public ShardRouting updateUnassigned(UnassignedInfo unassignedInfo, RecoverySour relocationFailureInfo, allocationId, expectedShardSize, - role + role, + becameUndesiredTime + ); + } + + public ShardRouting updateUndesired(long becameUndesiredTime) { + if (this.becameUndesiredTime != NOT_UNDESIRED_TIMESTAMP) { + return this; + } + return new ShardRouting( + shardId, + currentNodeId, + relocatingNodeId, + primary, + state, + recoverySource, + unassignedInfo, + relocationFailureInfo, + allocationId, + expectedShardSize, + role, + becameUndesiredTime ); } + public ShardRouting clearUndesired() { + if (becameUndesiredTime == NOT_UNDESIRED_TIMESTAMP) { + return this; + } + return new ShardRouting( + shardId, + currentNodeId, + relocatingNodeId, + primary, + state, + recoverySource, + unassignedInfo, + relocationFailureInfo, + allocationId, + expectedShardSize, + role, + NOT_UNDESIRED_TIMESTAMP + ); + } + + public boolean isInUndesiredAllocation() { + return becameUndesiredTime != NOT_UNDESIRED_TIMESTAMP; + } + + public long becameUndesiredTime() { + return becameUndesiredTime; + } + public ShardRouting updateRelocationFailure(RelocationFailureInfo relocationFailureInfo) { assert this.relocationFailureInfo != null : "can only update relocation failure info info if it is already set"; return new ShardRouting( @@ -439,7 +495,8 @@ public ShardRouting updateRelocationFailure(RelocationFailureInfo relocationFail relocationFailureInfo, allocationId, expectedShardSize, - role + role, + becameUndesiredTime ); } @@ -469,7 +526,8 @@ public ShardRouting moveToUnassigned(UnassignedInfo unassignedInfo) { RelocationFailureInfo.NO_FAILURES, null, UNAVAILABLE_EXPECTED_SHARD_SIZE, - role + role, + becameUndesiredTime ); } @@ -498,7 +556,8 @@ public ShardRouting initialize(String nodeId, @Nullable String existingAllocatio RelocationFailureInfo.NO_FAILURES, allocationId, expectedShardSize, - role + role, + becameUndesiredTime ); } @@ -520,7 +579,8 @@ public ShardRouting relocate(String relocatingNodeId, long expectedShardSize) { relocationFailureInfo, AllocationId.newRelocation(allocationId), expectedShardSize, - role + role, + becameUndesiredTime ); } @@ -543,7 +603,8 @@ public ShardRouting cancelRelocation() { relocationFailureInfo.incFailedRelocations(), AllocationId.cancelRelocation(allocationId), UNAVAILABLE_EXPECTED_SHARD_SIZE, - role + role, + becameUndesiredTime ); } @@ -568,7 +629,8 @@ public ShardRouting removeRelocationSource() { relocationFailureInfo, AllocationId.finishRelocation(allocationId), expectedShardSize, - role + role, + becameUndesiredTime ); } @@ -590,7 +652,8 @@ public ShardRouting reinitializeReplicaShard() { relocationFailureInfo, AllocationId.newInitializing(), expectedShardSize, - role + role, + becameUndesiredTime ); } @@ -618,7 +681,8 @@ public ShardRouting moveToStarted(long expectedShardSize) { RelocationFailureInfo.NO_FAILURES, allocationId, expectedShardSize, - role + role, + becameUndesiredTime ); } @@ -643,7 +707,8 @@ public ShardRouting moveActiveReplicaToPrimary() { relocationFailureInfo, allocationId, expectedShardSize, - role + role, + becameUndesiredTime ); } @@ -668,7 +733,8 @@ public ShardRouting moveUnassignedFromPrimary() { relocationFailureInfo, allocationId, expectedShardSize, - role + role, + becameUndesiredTime ); } diff --git a/server/src/main/java/org/elasticsearch/cluster/routing/allocation/allocator/DesiredBalanceReconciler.java b/server/src/main/java/org/elasticsearch/cluster/routing/allocation/allocator/DesiredBalanceReconciler.java index 19661efef2f0a..944d3e37493f8 100644 --- a/server/src/main/java/org/elasticsearch/cluster/routing/allocation/allocator/DesiredBalanceReconciler.java +++ b/server/src/main/java/org/elasticsearch/cluster/routing/allocation/allocator/DesiredBalanceReconciler.java @@ -11,8 +11,6 @@ import com.carrotsearch.hppc.ObjectLongHashMap; import com.carrotsearch.hppc.ObjectLongMap; -import com.carrotsearch.hppc.procedures.LongProcedure; -import com.carrotsearch.hppc.procedures.ObjectLongProcedure; import org.apache.logging.log4j.LogManager; import org.apache.logging.log4j.Logger; @@ -33,7 +31,6 @@ import org.elasticsearch.common.settings.ClusterSettings; import org.elasticsearch.common.settings.Setting; import org.elasticsearch.common.time.TimeProvider; -import org.elasticsearch.core.Nullable; import org.elasticsearch.core.TimeValue; import org.elasticsearch.core.Tuple; import org.elasticsearch.gateway.PriorityComparator; @@ -99,7 +96,6 @@ public class DesiredBalanceReconciler { private final NodeAllocationOrdering allocationOrdering = new NodeAllocationOrdering(); private final NodeAllocationOrdering moveOrdering = new NodeAllocationOrdering(); private volatile TimeValue immovableShardThreshold; - private final ObjectLongHashMap immovableShards = new ObjectLongHashMap<>(); public DesiredBalanceReconciler(ClusterSettings clusterSettings, TimeProvider timeProvider) { this.timeProvider = timeProvider; @@ -134,7 +130,6 @@ public DesiredBalanceMetrics.AllocationStats reconcile(DesiredBalance desiredBal public void clear() { allocationOrdering.clear(); moveOrdering.clear(); - immovableShards.clear(); } /** @@ -499,7 +494,7 @@ private void moveShards() { // Iterate over all started shards and check if they can remain. In the presence of throttling shard movements, // the goal of this iteration order is to achieve a fairer movement of shards from the nodes that are offloading the shards. for (final var iterator = OrderedShardsIterator.createForNecessaryMoves(allocation, moveOrdering); iterator.hasNext();) { - final var shardRouting = iterator.next(); + var shardRouting = iterator.next(); if (shardRouting.started() == false) { // can only move started shards @@ -514,8 +509,10 @@ private void moveShards() { if (assignment.nodeIds().contains(shardRouting.currentNodeId())) { // shard is already on a desired node + routingNodes.clearUndesired(shardRouting); continue; } + shardRouting = routingNodes.markAsUndesired(shardRouting, timeProvider.relativeTimeInMillis()); if (allocation.deciders().canAllocate(shardRouting, allocation).type() != Decision.Type.YES) { // cannot allocate anywhere, no point in looking for a target node @@ -534,7 +531,6 @@ private void moveShards() { final var moveTarget = findRelocationTarget(shardRouting, assignment.nodeIds()); if (moveTarget != null) { logger.debug("Moving shard {} from {} to {}", shardRouting.shardId(), shardRouting.currentNodeId(), moveTarget.getId()); - immovableShards.remove(shardRouting); routingNodes.relocateShard( shardRouting, moveTarget.getId(), @@ -558,8 +554,9 @@ private DesiredBalanceMetrics.AllocationStats balance() { // Iterate over all started shards and try to move any which are on undesired nodes. In the presence of throttling shard // movements, the goal of this iteration order is to achieve a fairer movement of shards from the nodes that are offloading the // shards. + var earliestUndesiredTimestamp = Long.MAX_VALUE; for (final var iterator = OrderedShardsIterator.createForBalancing(allocation, moveOrdering); iterator.hasNext();) { - final var shardRouting = iterator.next(); + var shardRouting = iterator.next(); totalAllocations++; totalAllocationsByRole.addTo(shardRouting.role(), 1); @@ -577,8 +574,12 @@ private DesiredBalanceMetrics.AllocationStats balance() { if (assignment.nodeIds().contains(shardRouting.currentNodeId())) { // shard is already on a desired node + routingNodes.clearUndesired(shardRouting); continue; } + shardRouting = routingNodes.markAsUndesired(shardRouting, timeProvider.relativeTimeInMillis()); + assert shardRouting.becameUndesiredTime() != ShardRouting.NOT_UNDESIRED_TIMESTAMP; + earliestUndesiredTimestamp = Math.min(earliestUndesiredTimestamp, shardRouting.becameUndesiredTime()); if (allocation.metadata().nodeShutdowns().contains(shardRouting.currentNodeId()) == false) { // shard is not on a shutting down node, nor is it on a desired node per the previous check. @@ -601,10 +602,8 @@ private DesiredBalanceMetrics.AllocationStats balance() { continue; } - final var rebalanceDecision = findRelocationTarget(shardRouting, assignment.nodeIds(), this::decideCanAllocate); - final var rebalanceTarget = rebalanceDecision.chosenNode(); + final var rebalanceTarget = findRelocationTarget(shardRouting, assignment.nodeIds(), this::decideCanAllocate); if (rebalanceTarget != null) { - immovableShards.remove(shardRouting); logger.debug( "Rebalancing shard {} from {} to {}", shardRouting.shardId(), @@ -621,17 +620,10 @@ private DesiredBalanceMetrics.AllocationStats balance() { ); iterator.dePrioritizeNode(shardRouting.currentNodeId()); moveOrdering.recordAllocation(shardRouting.currentNodeId()); - } else { - // Start tracking this shard as immovable if we are not already, we're not interested in shards that are THROTTLED - if (rebalanceDecision.bestDecision() == null || rebalanceDecision.bestDecision() == Decision.NO) { - if (immovableShards.containsKey(shardRouting) == false) { - immovableShards.put(shardRouting, timeProvider.relativeTimeInMillis()); - } - } } } - maybeLogImmovableShardsWarning(); + maybeLogUndesiredShardsWarning(earliestUndesiredTimestamp); maybeLogUndesiredAllocationsWarning(totalAllocations, undesiredAllocationsExcludingShuttingDownNodes, routingNodes.size()); return new DesiredBalanceMetrics.AllocationStats( unassignedShards, @@ -649,31 +641,35 @@ private DesiredBalanceMetrics.AllocationStats balance() { } /** - * If there are shards that have been immovable for a long time, log a warning + * If there are shards that have been in undesirable allocations for a long time, log a warning */ - private void maybeLogImmovableShardsWarning() { + private void maybeLogUndesiredShardsWarning(long earliestUndesiredTimestamp) { final long currentTimeMillis = timeProvider.relativeTimeInMillis(); - if (currentTimeMillis - oldestImmovableShardTimestamp() > immovableShardThreshold.millis()) { - immovableShardsLogInterval.maybeExecute(this::logDecisionsForImmovableShardsOverThreshold); + if (currentTimeMillis - earliestUndesiredTimestamp > immovableShardThreshold.millis()) { + immovableShardsLogInterval.maybeExecute(this::logDecisionsForUndesiredShardsOverThreshold); } } - private void logDecisionsForImmovableShardsOverThreshold() { + private void logDecisionsForUndesiredShardsOverThreshold() { final long currentTimeMillis = timeProvider.relativeTimeInMillis(); - immovableShards.forEach((ObjectLongProcedure) (shardRouting, immovableSinceMillis) -> { - final long immovableDurationMs = currentTimeMillis - immovableSinceMillis; - if (immovableDurationMs > immovableShardThreshold.millis()) { - logImmovableShardDetails(shardRouting, TimeValue.timeValueMillis(immovableDurationMs)); - } + routingNodes.forEach(routingNode -> { + routingNode.forEach(shardRouting -> { + if (shardRouting.isInUndesiredAllocation()) { + final long undesiredDurationMs = currentTimeMillis - shardRouting.becameUndesiredTime(); + if (undesiredDurationMs > immovableShardThreshold.millis()) { + logUndesiredShardDetails(shardRouting, TimeValue.timeValueMillis(undesiredDurationMs)); + } + } + }); }); } - private void logImmovableShardDetails(ShardRouting shardRouting, TimeValue immovableDuration) { + private void logUndesiredShardDetails(ShardRouting shardRouting, TimeValue immovableDuration) { final RoutingAllocation.DebugMode originalDebugMode = allocation.getDebugMode(); allocation.setDebugMode(RoutingAllocation.DebugMode.EXCLUDE_YES_DECISIONS); try { final var assignment = desiredBalance.getAssignment(shardRouting.shardId()); - logger.warn("Shard {} has been immovable for {}", shardRouting.shardId(), immovableDuration); + logger.warn("Shard {} has been in an undesired allocation for {}", shardRouting.shardId(), immovableDuration); for (final var nodeId : assignment.nodeIds()) { final var decision = allocation.deciders().canAllocate(shardRouting, routingNodes.node(nodeId), allocation); logger.warn("Shard [{}] cannot be allocated on node [{}]: {}", shardRouting.shardId(), nodeId, decision); @@ -683,16 +679,6 @@ private void logImmovableShardDetails(ShardRouting shardRouting, TimeValue immov } } - private long oldestImmovableShardTimestamp() { - long[] oldestTimestamp = { Long.MAX_VALUE }; - immovableShards.values().forEach((LongProcedure) lc -> { - if (lc < oldestTimestamp[0]) { - oldestTimestamp[0] = lc; - } - }); - return oldestTimestamp[0]; - } - private void maybeLogUndesiredAllocationsWarning(int totalAllocations, int undesiredAllocations, int nodeCount) { // more shards than cluster can relocate with one reroute final boolean nonEmptyRelocationBacklog = undesiredAllocations > 2L * nodeCount; @@ -711,25 +697,24 @@ private void maybeLogUndesiredAllocationsWarning(int totalAllocations, int undes } private DiscoveryNode findRelocationTarget(final ShardRouting shardRouting, Set desiredNodeIds) { - final var moveDecision = findRelocationTarget(shardRouting, desiredNodeIds, this::decideCanAllocate).chosenNode(); + final var moveDecision = findRelocationTarget(shardRouting, desiredNodeIds, this::decideCanAllocate); if (moveDecision != null) { return moveDecision; } final var shardsOnReplacedNode = allocation.metadata().nodeShutdowns().contains(shardRouting.currentNodeId(), REPLACE); if (shardsOnReplacedNode) { - return findRelocationTarget(shardRouting, desiredNodeIds, this::decideCanForceAllocateForVacate).chosenNode(); + return findRelocationTarget(shardRouting, desiredNodeIds, this::decideCanForceAllocateForVacate); } return null; } - private DecisionAndResult findRelocationTarget( + private DiscoveryNode findRelocationTarget( ShardRouting shardRouting, Set desiredNodeIds, BiFunction canAllocateDecider ) { DiscoveryNode chosenNode = null; - Decision bestDecision = null; for (final var nodeId : desiredNodeIds) { // TODO consider ignored nodes here too? if (nodeId.equals(shardRouting.currentNodeId())) { @@ -747,7 +732,6 @@ private DecisionAndResult findRelocationTarget( // better to offload shards first. if (decision.type() == Decision.Type.YES) { chosenNode = node.node(); - bestDecision = decision; // As soon as we get any YES, we return it. break; } else if (decision.type() == Decision.Type.NOT_PREFERRED && chosenNode == null) { @@ -756,21 +740,12 @@ private DecisionAndResult findRelocationTarget( // choose and the shard cannot remain where it is, we accept not-preferred. NOT_PREFERRED is essentially a YES for // reconciliation. chosenNode = node.node(); - bestDecision = decision; } } - return new DecisionAndResult(bestDecision, chosenNode); + return chosenNode; } - /** - * An allocation decision result - * - * @param bestDecision The best decision we saw from the nodes attempted (can be null if no nodes were attempted) - * @param chosenNode The node to allocate the shard to (can be null if no suitable nodes were found) - */ - private record DecisionAndResult(@Nullable Decision bestDecision, @Nullable DiscoveryNode chosenNode) {} - private Decision decideCanAllocate(ShardRouting shardRouting, RoutingNode target) { assert target != null : "Target node is not found"; return allocation.deciders().canAllocate(shardRouting, target, allocation); diff --git a/server/src/test/java/org/elasticsearch/cluster/routing/IndexRoutingTableTests.java b/server/src/test/java/org/elasticsearch/cluster/routing/IndexRoutingTableTests.java index 37d23cc855ec4..dcf5ae461be99 100644 --- a/server/src/test/java/org/elasticsearch/cluster/routing/IndexRoutingTableTests.java +++ b/server/src/test/java/org/elasticsearch/cluster/routing/IndexRoutingTableTests.java @@ -146,7 +146,8 @@ private ShardRouting getShard(ShardId shardId, boolean isPrimary, ShardRoutingSt TestShardRouting.buildRelocationFailureInfo(state), TestShardRouting.buildAllocationId(state), randomLongBetween(-1, 1024), - role + role, + randomBoolean() ? ShardRouting.UNAVAILABLE_EXPECTED_SHARD_SIZE : randomLong() ); } diff --git a/server/src/test/java/org/elasticsearch/cluster/routing/ShardRoutingTests.java b/server/src/test/java/org/elasticsearch/cluster/routing/ShardRoutingTests.java index 23fdb47619ef8..61eb50730366f 100644 --- a/server/src/test/java/org/elasticsearch/cluster/routing/ShardRoutingTests.java +++ b/server/src/test/java/org/elasticsearch/cluster/routing/ShardRoutingTests.java @@ -51,7 +51,8 @@ protected ShardRouting createTestInstance() { TestShardRouting.buildRelocationFailureInfo(state), TestShardRouting.buildAllocationId(state), randomLongBetween(-1, 1024), - randomFrom(ShardRouting.Role.DEFAULT, (primary ? ShardRouting.Role.INDEX_ONLY : ShardRouting.Role.SEARCH_ONLY)) + randomFrom(ShardRouting.Role.DEFAULT, (primary ? ShardRouting.Role.INDEX_ONLY : ShardRouting.Role.SEARCH_ONLY)), + randomBoolean() ? ShardRouting.NOT_UNDESIRED_TIMESTAMP : randomLong() ); } @@ -83,7 +84,8 @@ private static ShardRouting mutateShardId(ShardRouting instance) { instance.relocationFailureInfo(), instance.allocationId(), instance.getExpectedShardSize(), - instance.role() + instance.role(), + instance.becameUndesiredTime() ); } @@ -115,7 +117,8 @@ private static ShardRouting mutateState(ShardRouting instance) { ); }, instance.getExpectedShardSize(), - instance.role() + instance.role(), + instance.becameUndesiredTime() ); } @@ -131,7 +134,8 @@ private static ShardRouting mutateCurrentNodeId(ShardRouting instance) { instance.relocationFailureInfo(), instance.allocationId(), instance.getExpectedShardSize(), - instance.role() + instance.role(), + instance.becameUndesiredTime() ); } @@ -153,7 +157,8 @@ private static ShardRouting mutateRole(ShardRouting instance) { ShardRouting.Role.DEFAULT, (instance.primary() ? ShardRouting.Role.INDEX_ONLY : ShardRouting.Role.SEARCH_ONLY) ) - ) + ), + instance.becameUndesiredTime() ); } @@ -274,7 +279,8 @@ public void testEqualsIgnoringVersion() { otherRouting.relocationFailureInfo(), otherRouting.allocationId(), otherRouting.getExpectedShardSize(), - otherRouting.role() + otherRouting.role(), + otherRouting.becameUndesiredTime() ); break; case 1: @@ -290,7 +296,8 @@ public void testEqualsIgnoringVersion() { otherRouting.relocationFailureInfo(), otherRouting.allocationId(), otherRouting.getExpectedShardSize(), - otherRouting.role() + otherRouting.role(), + otherRouting.becameUndesiredTime() ); break; case 2: @@ -309,7 +316,8 @@ public void testEqualsIgnoringVersion() { otherRouting.relocationFailureInfo(), otherRouting.allocationId(), otherRouting.getExpectedShardSize(), - otherRouting.role() + otherRouting.role(), + otherRouting.becameUndesiredTime() ); } break; @@ -329,7 +337,8 @@ public void testEqualsIgnoringVersion() { otherRouting.relocationFailureInfo(), otherRouting.allocationId(), otherRouting.getExpectedShardSize(), - otherRouting.role() + otherRouting.role(), + otherRouting.becameUndesiredTime() ); } break; @@ -354,7 +363,8 @@ public void testEqualsIgnoringVersion() { otherRouting.relocationFailureInfo(), otherRouting.allocationId(), otherRouting.getExpectedShardSize(), - otherRouting.role() + otherRouting.role(), + otherRouting.becameUndesiredTime() ); } break; diff --git a/test/framework/src/main/java/org/elasticsearch/cluster/routing/ShardRoutingHelper.java b/test/framework/src/main/java/org/elasticsearch/cluster/routing/ShardRoutingHelper.java index a536112e93a1f..59161f5c688fd 100644 --- a/test/framework/src/main/java/org/elasticsearch/cluster/routing/ShardRoutingHelper.java +++ b/test/framework/src/main/java/org/elasticsearch/cluster/routing/ShardRoutingHelper.java @@ -52,7 +52,8 @@ public static ShardRouting initWithSameId(ShardRouting copy, RecoverySource reco RelocationFailureInfo.NO_FAILURES, copy.allocationId(), copy.getExpectedShardSize(), - copy.role() + copy.role(), + copy.becameUndesiredTime() ); } @@ -72,7 +73,8 @@ public static ShardRouting newWithRestoreSource(ShardRouting routing, SnapshotRe routing.relocationFailureInfo(), routing.allocationId(), routing.getExpectedShardSize(), - routing.role() + routing.role(), + routing.becameUndesiredTime() ); } } diff --git a/test/framework/src/main/java/org/elasticsearch/cluster/routing/TestShardRouting.java b/test/framework/src/main/java/org/elasticsearch/cluster/routing/TestShardRouting.java index 84e1dd532e2b7..2dee946f6fa43 100644 --- a/test/framework/src/main/java/org/elasticsearch/cluster/routing/TestShardRouting.java +++ b/test/framework/src/main/java/org/elasticsearch/cluster/routing/TestShardRouting.java @@ -63,6 +63,7 @@ public static class Builder { private AllocationId allocationId; private Long expectedShardSize; private ShardRouting.Role role; + private Long becameUndesired; public Builder(ShardId shardId, String currentNodeId, boolean primary, ShardRoutingState state) { this.shardId = shardId; @@ -106,6 +107,11 @@ public Builder withRole(ShardRouting.Role role) { return this; } + public Builder withBecameUndesired(long becameUndesired) { + this.becameUndesired = becameUndesired; + return this; + } + public ShardRouting build() { return new ShardRouting( shardId, @@ -118,7 +124,8 @@ public ShardRouting build() { relocationFailureInfo != null ? relocationFailureInfo : buildRelocationFailureInfo(state), allocationId != null ? allocationId : buildAllocationId(state), expectedShardSize != null ? expectedShardSize : ShardRouting.UNAVAILABLE_EXPECTED_SHARD_SIZE, - role != null ? role : ShardRouting.Role.DEFAULT + role != null ? role : ShardRouting.Role.DEFAULT, + becameUndesired != null ? becameUndesired : ShardRouting.NOT_UNDESIRED_TIMESTAMP ); } } @@ -160,7 +167,8 @@ public static ShardRouting newShardRouting( buildRelocationFailureInfo(state), buildAllocationId(state), ShardRouting.UNAVAILABLE_EXPECTED_SHARD_SIZE, - role + role, + ShardRouting.NOT_UNDESIRED_TIMESTAMP ); } @@ -184,7 +192,8 @@ public static ShardRouting newShardRouting( buildRelocationFailureInfo(state), buildAllocationId(state), ShardRouting.UNAVAILABLE_EXPECTED_SHARD_SIZE, - role + role, + ShardRouting.NOT_UNDESIRED_TIMESTAMP ); } @@ -223,7 +232,8 @@ public static ShardRouting newShardRouting( buildRelocationFailureInfo(state), buildAllocationId(state), ShardRouting.UNAVAILABLE_EXPECTED_SHARD_SIZE, - ShardRouting.Role.DEFAULT + ShardRouting.Role.DEFAULT, + ShardRouting.NOT_UNDESIRED_TIMESTAMP ); } From f050316c8b7ec2dcd4427045a3dd673b1a347f9f Mon Sep 17 00:00:00 2001 From: Nick Tindall Date: Mon, 27 Oct 2025 16:10:11 +1100 Subject: [PATCH 09/44] Fix setting names --- .../allocator/DesiredBalanceReconciler.java | 11 +++++++---- .../common/settings/ClusterSettings.java | 2 +- 2 files changed, 8 insertions(+), 5 deletions(-) diff --git a/server/src/main/java/org/elasticsearch/cluster/routing/allocation/allocator/DesiredBalanceReconciler.java b/server/src/main/java/org/elasticsearch/cluster/routing/allocation/allocator/DesiredBalanceReconciler.java index 944d3e37493f8..90c43641d28ec 100644 --- a/server/src/main/java/org/elasticsearch/cluster/routing/allocation/allocator/DesiredBalanceReconciler.java +++ b/server/src/main/java/org/elasticsearch/cluster/routing/allocation/allocator/DesiredBalanceReconciler.java @@ -80,10 +80,10 @@ public class DesiredBalanceReconciler { ); /** - * Warning logs will be periodically written if we see a shard that's been unable to be allocated for this long + * Warning logs will be periodically written if we see a shard that's been in an undesired allocation for this long */ - public static final Setting IMMOVABLE_SHARD_LOG_THRESHOLD_SETTING = Setting.timeSetting( - "cluster.routing.allocation.desired_balance.immovable_shard_logging.threshold", + public static final Setting UNDESIRED_ALLOCATION_DURATION_LOG_THRESHOLD_SETTING = Setting.timeSetting( + "cluster.routing.allocation.desired_balance.undesired_duration_logging.threshold", TimeValue.timeValueMinutes(5), Setting.Property.Dynamic, Setting.Property.NodeScope @@ -109,7 +109,10 @@ public DesiredBalanceReconciler(ClusterSettings clusterSettings, TimeProvider ti UNDESIRED_ALLOCATIONS_LOG_THRESHOLD_SETTING, value -> this.undesiredAllocationsLogThreshold = value ); - clusterSettings.initializeAndWatch(IMMOVABLE_SHARD_LOG_THRESHOLD_SETTING, value -> this.immovableShardThreshold = value); + clusterSettings.initializeAndWatch( + UNDESIRED_ALLOCATION_DURATION_LOG_THRESHOLD_SETTING, + value -> this.immovableShardThreshold = value + ); } /** diff --git a/server/src/main/java/org/elasticsearch/common/settings/ClusterSettings.java b/server/src/main/java/org/elasticsearch/common/settings/ClusterSettings.java index 72f427fea6f22..25880cbbfa453 100644 --- a/server/src/main/java/org/elasticsearch/common/settings/ClusterSettings.java +++ b/server/src/main/java/org/elasticsearch/common/settings/ClusterSettings.java @@ -235,7 +235,7 @@ public void apply(Settings value, Settings current, Settings previous) { DesiredBalanceComputer.MAX_BALANCE_COMPUTATION_TIME_DURING_INDEX_CREATION_SETTING, DesiredBalanceReconciler.UNDESIRED_ALLOCATIONS_LOG_INTERVAL_SETTING, DesiredBalanceReconciler.UNDESIRED_ALLOCATIONS_LOG_THRESHOLD_SETTING, - DesiredBalanceReconciler.IMMOVABLE_SHARD_LOG_THRESHOLD_SETTING, + DesiredBalanceReconciler.UNDESIRED_ALLOCATION_DURATION_LOG_THRESHOLD_SETTING, BreakerSettings.CIRCUIT_BREAKER_LIMIT_SETTING, BreakerSettings.CIRCUIT_BREAKER_OVERHEAD_SETTING, BreakerSettings.CIRCUIT_BREAKER_TYPE, From c7b14fa8c30dd1b6689932370c6df44b50598ceb Mon Sep 17 00:00:00 2001 From: Nick Tindall Date: Mon, 27 Oct 2025 16:13:05 +1100 Subject: [PATCH 10/44] Naming --- .../allocator/DesiredBalanceReconciler.java | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) diff --git a/server/src/main/java/org/elasticsearch/cluster/routing/allocation/allocator/DesiredBalanceReconciler.java b/server/src/main/java/org/elasticsearch/cluster/routing/allocation/allocator/DesiredBalanceReconciler.java index 90c43641d28ec..8ca6e3159df75 100644 --- a/server/src/main/java/org/elasticsearch/cluster/routing/allocation/allocator/DesiredBalanceReconciler.java +++ b/server/src/main/java/org/elasticsearch/cluster/routing/allocation/allocator/DesiredBalanceReconciler.java @@ -92,10 +92,10 @@ public class DesiredBalanceReconciler { private final TimeProvider timeProvider; private final FrequencyCappedAction undesiredAllocationLogInterval; private final FrequencyCappedAction immovableShardsLogInterval; - private double undesiredAllocationsLogThreshold; + private double undesiredAllocationsPercentageLoggingThreshold; private final NodeAllocationOrdering allocationOrdering = new NodeAllocationOrdering(); private final NodeAllocationOrdering moveOrdering = new NodeAllocationOrdering(); - private volatile TimeValue immovableShardThreshold; + private volatile TimeValue undesiredAllocationDurationLoggingThreshold; public DesiredBalanceReconciler(ClusterSettings clusterSettings, TimeProvider timeProvider) { this.timeProvider = timeProvider; @@ -107,11 +107,11 @@ public DesiredBalanceReconciler(ClusterSettings clusterSettings, TimeProvider ti }); clusterSettings.initializeAndWatch( UNDESIRED_ALLOCATIONS_LOG_THRESHOLD_SETTING, - value -> this.undesiredAllocationsLogThreshold = value + value -> this.undesiredAllocationsPercentageLoggingThreshold = value ); clusterSettings.initializeAndWatch( UNDESIRED_ALLOCATION_DURATION_LOG_THRESHOLD_SETTING, - value -> this.immovableShardThreshold = value + value -> this.undesiredAllocationDurationLoggingThreshold = value ); } @@ -648,7 +648,7 @@ private DesiredBalanceMetrics.AllocationStats balance() { */ private void maybeLogUndesiredShardsWarning(long earliestUndesiredTimestamp) { final long currentTimeMillis = timeProvider.relativeTimeInMillis(); - if (currentTimeMillis - earliestUndesiredTimestamp > immovableShardThreshold.millis()) { + if (currentTimeMillis - earliestUndesiredTimestamp > undesiredAllocationDurationLoggingThreshold.millis()) { immovableShardsLogInterval.maybeExecute(this::logDecisionsForUndesiredShardsOverThreshold); } } @@ -659,7 +659,7 @@ private void logDecisionsForUndesiredShardsOverThreshold() { routingNode.forEach(shardRouting -> { if (shardRouting.isInUndesiredAllocation()) { final long undesiredDurationMs = currentTimeMillis - shardRouting.becameUndesiredTime(); - if (undesiredDurationMs > immovableShardThreshold.millis()) { + if (undesiredDurationMs > undesiredAllocationDurationLoggingThreshold.millis()) { logUndesiredShardDetails(shardRouting, TimeValue.timeValueMillis(undesiredDurationMs)); } } @@ -685,7 +685,8 @@ private void logUndesiredShardDetails(ShardRouting shardRouting, TimeValue immov private void maybeLogUndesiredAllocationsWarning(int totalAllocations, int undesiredAllocations, int nodeCount) { // more shards than cluster can relocate with one reroute final boolean nonEmptyRelocationBacklog = undesiredAllocations > 2L * nodeCount; - final boolean warningThresholdReached = undesiredAllocations > undesiredAllocationsLogThreshold * totalAllocations; + final boolean warningThresholdReached = undesiredAllocations > undesiredAllocationsPercentageLoggingThreshold + * totalAllocations; if (totalAllocations > 0 && nonEmptyRelocationBacklog && warningThresholdReached) { undesiredAllocationLogInterval.maybeExecute( () -> logger.warn( @@ -693,7 +694,7 @@ private void maybeLogUndesiredAllocationsWarning(int totalAllocations, int undes Strings.format1Decimals(100.0 * undesiredAllocations / totalAllocations, "%"), undesiredAllocations, totalAllocations, - Strings.format1Decimals(100.0 * undesiredAllocationsLogThreshold, "%") + Strings.format1Decimals(100.0 * undesiredAllocationsPercentageLoggingThreshold, "%") ) ); } From 5cf44fc04851272c3318180a8105a9ddac469070 Mon Sep 17 00:00:00 2001 From: Nick Tindall Date: Mon, 27 Oct 2025 16:15:32 +1100 Subject: [PATCH 11/44] Naming --- .../allocation/allocator/DesiredBalanceReconciler.java | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/server/src/main/java/org/elasticsearch/cluster/routing/allocation/allocator/DesiredBalanceReconciler.java b/server/src/main/java/org/elasticsearch/cluster/routing/allocation/allocator/DesiredBalanceReconciler.java index 8ca6e3159df75..f75ce19b75999 100644 --- a/server/src/main/java/org/elasticsearch/cluster/routing/allocation/allocator/DesiredBalanceReconciler.java +++ b/server/src/main/java/org/elasticsearch/cluster/routing/allocation/allocator/DesiredBalanceReconciler.java @@ -91,7 +91,7 @@ public class DesiredBalanceReconciler { private final TimeProvider timeProvider; private final FrequencyCappedAction undesiredAllocationLogInterval; - private final FrequencyCappedAction immovableShardsLogInterval; + private final FrequencyCappedAction undesiredAllocationDurationLogInterval; private double undesiredAllocationsPercentageLoggingThreshold; private final NodeAllocationOrdering allocationOrdering = new NodeAllocationOrdering(); private final NodeAllocationOrdering moveOrdering = new NodeAllocationOrdering(); @@ -100,10 +100,10 @@ public class DesiredBalanceReconciler { public DesiredBalanceReconciler(ClusterSettings clusterSettings, TimeProvider timeProvider) { this.timeProvider = timeProvider; this.undesiredAllocationLogInterval = new FrequencyCappedAction(timeProvider::relativeTimeInMillis, TimeValue.timeValueMinutes(5)); - this.immovableShardsLogInterval = new FrequencyCappedAction(timeProvider::relativeTimeInMillis, TimeValue.ZERO); + this.undesiredAllocationDurationLogInterval = new FrequencyCappedAction(timeProvider::relativeTimeInMillis, TimeValue.ZERO); clusterSettings.initializeAndWatch(UNDESIRED_ALLOCATIONS_LOG_INTERVAL_SETTING, value -> { this.undesiredAllocationLogInterval.setMinInterval(value); - this.immovableShardsLogInterval.setMinInterval(value); + this.undesiredAllocationDurationLogInterval.setMinInterval(value); }); clusterSettings.initializeAndWatch( UNDESIRED_ALLOCATIONS_LOG_THRESHOLD_SETTING, @@ -649,7 +649,7 @@ private DesiredBalanceMetrics.AllocationStats balance() { private void maybeLogUndesiredShardsWarning(long earliestUndesiredTimestamp) { final long currentTimeMillis = timeProvider.relativeTimeInMillis(); if (currentTimeMillis - earliestUndesiredTimestamp > undesiredAllocationDurationLoggingThreshold.millis()) { - immovableShardsLogInterval.maybeExecute(this::logDecisionsForUndesiredShardsOverThreshold); + undesiredAllocationDurationLogInterval.maybeExecute(this::logDecisionsForUndesiredShardsOverThreshold); } } From 1472ec2625b348474bd1e369c4e1acebe366835a Mon Sep 17 00:00:00 2001 From: Nick Tindall Date: Mon, 27 Oct 2025 16:33:17 +1100 Subject: [PATCH 12/44] ShardRouting#equals/hashCode --- .../java/org/elasticsearch/cluster/routing/ShardRouting.java | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/server/src/main/java/org/elasticsearch/cluster/routing/ShardRouting.java b/server/src/main/java/org/elasticsearch/cluster/routing/ShardRouting.java index 40ff3157d91d0..cf9232964e68b 100644 --- a/server/src/main/java/org/elasticsearch/cluster/routing/ShardRouting.java +++ b/server/src/main/java/org/elasticsearch/cluster/routing/ShardRouting.java @@ -878,7 +878,8 @@ public boolean equals(Object o) { ShardRouting that = (ShardRouting) o; return equalsIgnoringMetadata(that) && Objects.equals(unassignedInfo, that.unassignedInfo) - && Objects.equals(relocationFailureInfo, that.relocationFailureInfo); + && Objects.equals(relocationFailureInfo, that.relocationFailureInfo) + && becameUndesiredTime == that.becameUndesiredTime; } /** @@ -900,6 +901,7 @@ public int hashCode() { h = 31 * h + (allocationId != null ? allocationId.hashCode() : 0); h = 31 * h + (unassignedInfo != null ? unassignedInfo.hashCode() : 0); h = 31 * h + (relocationFailureInfo != null ? relocationFailureInfo.hashCode() : 0); + h = 31 * h + Long.hashCode(becameUndesiredTime); h = 31 * h + role.hashCode(); hashCode = h; } From 765236921889c6cb321f1c3e83b2e0d1f2f2957e Mon Sep 17 00:00:00 2001 From: Nick Tindall Date: Mon, 27 Oct 2025 16:34:53 +1100 Subject: [PATCH 13/44] Javadoc --- .../routing/allocation/allocator/DesiredBalanceReconciler.java | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/server/src/main/java/org/elasticsearch/cluster/routing/allocation/allocator/DesiredBalanceReconciler.java b/server/src/main/java/org/elasticsearch/cluster/routing/allocation/allocator/DesiredBalanceReconciler.java index f75ce19b75999..947fde370da13 100644 --- a/server/src/main/java/org/elasticsearch/cluster/routing/allocation/allocator/DesiredBalanceReconciler.java +++ b/server/src/main/java/org/elasticsearch/cluster/routing/allocation/allocator/DesiredBalanceReconciler.java @@ -644,7 +644,8 @@ private DesiredBalanceMetrics.AllocationStats balance() { } /** - * If there are shards that have been in undesirable allocations for a long time, log a warning + * If there are shards that have been in undesired allocations for longer than the configured + * threshold, log a warning */ private void maybeLogUndesiredShardsWarning(long earliestUndesiredTimestamp) { final long currentTimeMillis = timeProvider.relativeTimeInMillis(); From 7996801b1fe8288abe70de3504ab552726f20d8e Mon Sep 17 00:00:00 2001 From: Nick Tindall Date: Mon, 27 Oct 2025 16:37:16 +1100 Subject: [PATCH 14/44] Fix logic to handle Long.MAX_VALUE --- .../routing/allocation/allocator/DesiredBalanceReconciler.java | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/server/src/main/java/org/elasticsearch/cluster/routing/allocation/allocator/DesiredBalanceReconciler.java b/server/src/main/java/org/elasticsearch/cluster/routing/allocation/allocator/DesiredBalanceReconciler.java index 947fde370da13..4fce6dd2aeded 100644 --- a/server/src/main/java/org/elasticsearch/cluster/routing/allocation/allocator/DesiredBalanceReconciler.java +++ b/server/src/main/java/org/elasticsearch/cluster/routing/allocation/allocator/DesiredBalanceReconciler.java @@ -649,7 +649,8 @@ private DesiredBalanceMetrics.AllocationStats balance() { */ private void maybeLogUndesiredShardsWarning(long earliestUndesiredTimestamp) { final long currentTimeMillis = timeProvider.relativeTimeInMillis(); - if (currentTimeMillis - earliestUndesiredTimestamp > undesiredAllocationDurationLoggingThreshold.millis()) { + if (earliestUndesiredTimestamp < currentTimeMillis + && currentTimeMillis - earliestUndesiredTimestamp > undesiredAllocationDurationLoggingThreshold.millis()) { undesiredAllocationDurationLogInterval.maybeExecute(this::logDecisionsForUndesiredShardsOverThreshold); } } From 2056c4e109ae82e863ffc2f1f07b36a7f1b9b361 Mon Sep 17 00:00:00 2001 From: Nick Tindall Date: Mon, 27 Oct 2025 16:40:28 +1100 Subject: [PATCH 15/44] javadoc --- .../java/org/elasticsearch/cluster/routing/RoutingNodes.java | 2 ++ 1 file changed, 2 insertions(+) diff --git a/server/src/main/java/org/elasticsearch/cluster/routing/RoutingNodes.java b/server/src/main/java/org/elasticsearch/cluster/routing/RoutingNodes.java index 61b4004d298c3..c542ccabbab13 100644 --- a/server/src/main/java/org/elasticsearch/cluster/routing/RoutingNodes.java +++ b/server/src/main/java/org/elasticsearch/cluster/routing/RoutingNodes.java @@ -894,6 +894,8 @@ public Map> getAssignedShards() { /** * Record that this shard is in an undesired location if it's not already marked as such + * + * @return The updated {@link ShardRouting} or this if no change was made */ public ShardRouting markAsUndesired(ShardRouting originalShard, long becameUndesiredTime) { final var withUndesiredInfo = originalShard.updateUndesired(becameUndesiredTime); From 6e37d04da50c4ece6579243ae05be6df9163c050 Mon Sep 17 00:00:00 2001 From: Nick Tindall Date: Mon, 27 Oct 2025 16:42:45 +1100 Subject: [PATCH 16/44] javadoc --- .../java/org/elasticsearch/cluster/routing/RoutingNodes.java | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/server/src/main/java/org/elasticsearch/cluster/routing/RoutingNodes.java b/server/src/main/java/org/elasticsearch/cluster/routing/RoutingNodes.java index c542ccabbab13..33ee23eb06b83 100644 --- a/server/src/main/java/org/elasticsearch/cluster/routing/RoutingNodes.java +++ b/server/src/main/java/org/elasticsearch/cluster/routing/RoutingNodes.java @@ -906,6 +906,11 @@ public ShardRouting markAsUndesired(ShardRouting originalShard, long becameUndes return originalShard; } + /** + * Clear any undesired allocation metadata from the specified shard + * + * @param originalShard The {@link ShardRouting} to clear undesired metadata from + */ public void clearUndesired(ShardRouting originalShard) { final var withoutUndesiredInfo = originalShard.clearUndesired(); if (withoutUndesiredInfo != originalShard) { From 904d597ed7edcb4bc43ea5f669ce6bd7aa10f5ec Mon Sep 17 00:00:00 2001 From: Nick Tindall Date: Mon, 27 Oct 2025 17:35:28 +1100 Subject: [PATCH 17/44] Naming, RoutingNodes updates --- .../elasticsearch/cluster/routing/RoutingNodes.java | 8 +++++--- .../elasticsearch/cluster/routing/ShardRouting.java | 2 +- .../allocator/DesiredBalanceReconciler.java | 11 +++++++---- 3 files changed, 13 insertions(+), 8 deletions(-) diff --git a/server/src/main/java/org/elasticsearch/cluster/routing/RoutingNodes.java b/server/src/main/java/org/elasticsearch/cluster/routing/RoutingNodes.java index 33ee23eb06b83..fc6bc7c3652fd 100644 --- a/server/src/main/java/org/elasticsearch/cluster/routing/RoutingNodes.java +++ b/server/src/main/java/org/elasticsearch/cluster/routing/RoutingNodes.java @@ -898,9 +898,10 @@ public Map> getAssignedShards() { * @return The updated {@link ShardRouting} or this if no change was made */ public ShardRouting markAsUndesired(ShardRouting originalShard, long becameUndesiredTime) { - final var withUndesiredInfo = originalShard.updateUndesired(becameUndesiredTime); + assert originalShard.started() : "Only started shards can be marked as being in an undesired allocation"; + final var withUndesiredInfo = originalShard.markAsUndesired(becameUndesiredTime); if (withUndesiredInfo != originalShard) { - nodesToShards.get(originalShard.currentNodeId()).update(originalShard, withUndesiredInfo); + updateAssigned(originalShard, withUndesiredInfo); return withUndesiredInfo; } return originalShard; @@ -912,9 +913,10 @@ public ShardRouting markAsUndesired(ShardRouting originalShard, long becameUndes * @param originalShard The {@link ShardRouting} to clear undesired metadata from */ public void clearUndesired(ShardRouting originalShard) { + assert originalShard.started() : "Only started shards can be marked as being in a desired allocation"; final var withoutUndesiredInfo = originalShard.clearUndesired(); if (withoutUndesiredInfo != originalShard) { - nodesToShards.get(originalShard.currentNodeId()).update(originalShard, withoutUndesiredInfo); + updateAssigned(originalShard, withoutUndesiredInfo); } } diff --git a/server/src/main/java/org/elasticsearch/cluster/routing/ShardRouting.java b/server/src/main/java/org/elasticsearch/cluster/routing/ShardRouting.java index cf9232964e68b..1314d86b1e345 100644 --- a/server/src/main/java/org/elasticsearch/cluster/routing/ShardRouting.java +++ b/server/src/main/java/org/elasticsearch/cluster/routing/ShardRouting.java @@ -434,7 +434,7 @@ public ShardRouting updateUnassigned(UnassignedInfo unassignedInfo, RecoverySour ); } - public ShardRouting updateUndesired(long becameUndesiredTime) { + public ShardRouting markAsUndesired(long becameUndesiredTime) { if (this.becameUndesiredTime != NOT_UNDESIRED_TIMESTAMP) { return this; } diff --git a/server/src/main/java/org/elasticsearch/cluster/routing/allocation/allocator/DesiredBalanceReconciler.java b/server/src/main/java/org/elasticsearch/cluster/routing/allocation/allocator/DesiredBalanceReconciler.java index 4fce6dd2aeded..862e36c181cfe 100644 --- a/server/src/main/java/org/elasticsearch/cluster/routing/allocation/allocator/DesiredBalanceReconciler.java +++ b/server/src/main/java/org/elasticsearch/cluster/routing/allocation/allocator/DesiredBalanceReconciler.java @@ -90,7 +90,7 @@ public class DesiredBalanceReconciler { ); private final TimeProvider timeProvider; - private final FrequencyCappedAction undesiredAllocationLogInterval; + private final FrequencyCappedAction undesiredAllocationPercentageLogInterval; private final FrequencyCappedAction undesiredAllocationDurationLogInterval; private double undesiredAllocationsPercentageLoggingThreshold; private final NodeAllocationOrdering allocationOrdering = new NodeAllocationOrdering(); @@ -99,10 +99,13 @@ public class DesiredBalanceReconciler { public DesiredBalanceReconciler(ClusterSettings clusterSettings, TimeProvider timeProvider) { this.timeProvider = timeProvider; - this.undesiredAllocationLogInterval = new FrequencyCappedAction(timeProvider::relativeTimeInMillis, TimeValue.timeValueMinutes(5)); + this.undesiredAllocationPercentageLogInterval = new FrequencyCappedAction( + timeProvider::relativeTimeInMillis, + TimeValue.timeValueMinutes(5) + ); this.undesiredAllocationDurationLogInterval = new FrequencyCappedAction(timeProvider::relativeTimeInMillis, TimeValue.ZERO); clusterSettings.initializeAndWatch(UNDESIRED_ALLOCATIONS_LOG_INTERVAL_SETTING, value -> { - this.undesiredAllocationLogInterval.setMinInterval(value); + this.undesiredAllocationPercentageLogInterval.setMinInterval(value); this.undesiredAllocationDurationLogInterval.setMinInterval(value); }); clusterSettings.initializeAndWatch( @@ -690,7 +693,7 @@ private void maybeLogUndesiredAllocationsWarning(int totalAllocations, int undes final boolean warningThresholdReached = undesiredAllocations > undesiredAllocationsPercentageLoggingThreshold * totalAllocations; if (totalAllocations > 0 && nonEmptyRelocationBacklog && warningThresholdReached) { - undesiredAllocationLogInterval.maybeExecute( + undesiredAllocationPercentageLogInterval.maybeExecute( () -> logger.warn( "[{}] of assigned shards ({}/{}) are not on their desired nodes, which exceeds the warn threshold of [{}]", Strings.format1Decimals(100.0 * undesiredAllocations / totalAllocations, "%"), From be13adbb80f34be53ada96fb62981a859a75988e Mon Sep 17 00:00:00 2001 From: Nick Tindall Date: Mon, 27 Oct 2025 17:40:58 +1100 Subject: [PATCH 18/44] Clear becameUndesiredTime when a shard goes unassigned --- .../java/org/elasticsearch/cluster/routing/ShardRouting.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/server/src/main/java/org/elasticsearch/cluster/routing/ShardRouting.java b/server/src/main/java/org/elasticsearch/cluster/routing/ShardRouting.java index 1314d86b1e345..0d892183826f4 100644 --- a/server/src/main/java/org/elasticsearch/cluster/routing/ShardRouting.java +++ b/server/src/main/java/org/elasticsearch/cluster/routing/ShardRouting.java @@ -527,7 +527,7 @@ public ShardRouting moveToUnassigned(UnassignedInfo unassignedInfo) { null, UNAVAILABLE_EXPECTED_SHARD_SIZE, role, - becameUndesiredTime + NOT_UNDESIRED_TIMESTAMP // Unassigned and undesired are tracked separately ); } From 5013031cd34b9fc6ed0c6f8fdd01ca6b7c1fd350 Mon Sep 17 00:00:00 2001 From: Nick Tindall Date: Mon, 27 Oct 2025 17:45:20 +1100 Subject: [PATCH 19/44] Naming --- .../allocation/allocator/DesiredBalanceReconciler.java | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/server/src/main/java/org/elasticsearch/cluster/routing/allocation/allocator/DesiredBalanceReconciler.java b/server/src/main/java/org/elasticsearch/cluster/routing/allocation/allocator/DesiredBalanceReconciler.java index 862e36c181cfe..de15beec6eb06 100644 --- a/server/src/main/java/org/elasticsearch/cluster/routing/allocation/allocator/DesiredBalanceReconciler.java +++ b/server/src/main/java/org/elasticsearch/cluster/routing/allocation/allocator/DesiredBalanceReconciler.java @@ -672,12 +672,12 @@ private void logDecisionsForUndesiredShardsOverThreshold() { }); } - private void logUndesiredShardDetails(ShardRouting shardRouting, TimeValue immovableDuration) { + private void logUndesiredShardDetails(ShardRouting shardRouting, TimeValue undesiredDuration) { final RoutingAllocation.DebugMode originalDebugMode = allocation.getDebugMode(); allocation.setDebugMode(RoutingAllocation.DebugMode.EXCLUDE_YES_DECISIONS); try { final var assignment = desiredBalance.getAssignment(shardRouting.shardId()); - logger.warn("Shard {} has been in an undesired allocation for {}", shardRouting.shardId(), immovableDuration); + logger.warn("Shard {} has been in an undesired allocation for {}", shardRouting.shardId(), undesiredDuration); for (final var nodeId : assignment.nodeIds()) { final var decision = allocation.deciders().canAllocate(shardRouting, routingNodes.node(nodeId), allocation); logger.warn("Shard [{}] cannot be allocated on node [{}]: {}", shardRouting.shardId(), nodeId, decision); From 047b4112e47060a7d2475b681f9234ce680a8781 Mon Sep 17 00:00:00 2001 From: Nick Tindall Date: Wed, 29 Oct 2025 14:17:46 +1100 Subject: [PATCH 20/44] Mock time provision correctly --- .../DesiredBalanceReconcilerTests.java | 53 ++++++++++++++----- 1 file changed, 41 insertions(+), 12 deletions(-) diff --git a/server/src/test/java/org/elasticsearch/cluster/routing/allocation/allocator/DesiredBalanceReconcilerTests.java b/server/src/test/java/org/elasticsearch/cluster/routing/allocation/allocator/DesiredBalanceReconcilerTests.java index 7812df331e539..dcd18001d11a9 100644 --- a/server/src/test/java/org/elasticsearch/cluster/routing/allocation/allocator/DesiredBalanceReconcilerTests.java +++ b/server/src/test/java/org/elasticsearch/cluster/routing/allocation/allocator/DesiredBalanceReconcilerTests.java @@ -57,6 +57,7 @@ import org.elasticsearch.common.collect.ImmutableOpenMap; import org.elasticsearch.common.settings.ClusterSettings; import org.elasticsearch.common.settings.Settings; +import org.elasticsearch.common.time.TimeProvider; import org.elasticsearch.common.util.Maps; import org.elasticsearch.common.util.concurrent.DeterministicTaskQueue; import org.elasticsearch.core.TimeValue; @@ -71,7 +72,6 @@ import org.elasticsearch.snapshots.SnapshotsInfoService; import org.elasticsearch.test.ESTestCase; import org.elasticsearch.test.MockLog; -import org.elasticsearch.threadpool.ThreadPool; import org.junit.BeforeClass; import java.util.Comparator; @@ -92,6 +92,7 @@ import java.util.stream.Collectors; import java.util.stream.IntStream; +import static java.util.concurrent.TimeUnit.NANOSECONDS; import static org.elasticsearch.cluster.ClusterInfo.shardIdentifierFromRouting; import static org.elasticsearch.cluster.routing.RoutingNodesHelper.shardsWithState; import static org.elasticsearch.cluster.routing.ShardRoutingState.STARTED; @@ -107,8 +108,6 @@ import static org.hamcrest.Matchers.notNullValue; import static org.hamcrest.Matchers.nullValue; import static org.hamcrest.Matchers.oneOf; -import static org.mockito.Mockito.mock; -import static org.mockito.Mockito.when; public class DesiredBalanceReconcilerTests extends ESAllocationTestCase { @@ -1359,13 +1358,10 @@ public void testShouldLogOnTooManyUndesiredAllocations() { .routingTable(routingTableBuilder) .build(); - var threadPool = mock(ThreadPool.class); - final var timeInMillisSupplier = new AtomicLong(); - when(threadPool.relativeTimeInMillisSupplier()).thenReturn(timeInMillisSupplier::incrementAndGet); - - var reconciler = new DesiredBalanceReconciler(createBuiltInClusterSettings(), threadPool); + var timeProvider = new AdvancingTimeProvider(); + var reconciler = new DesiredBalanceReconciler(createBuiltInClusterSettings(), timeProvider); final long initialDelayInMillis = TimeValue.timeValueMinutes(5).getMillis(); - timeInMillisSupplier.addAndGet(randomLongBetween(initialDelayInMillis, 2 * initialDelayInMillis)); + timeProvider.advanceByMillis(randomLongBetween(initialDelayInMillis, 2 * initialDelayInMillis)); var expectedWarningMessage = "[100%] of assigned shards (" + shardCount @@ -1421,10 +1417,11 @@ private static void reconcile( DesiredBalance desiredBalance, AtomicReference allocationStatsAtomicReference ) { - final var threadPool = mock(ThreadPool.class); - when(threadPool.relativeTimeInMillisSupplier()).thenReturn(new AtomicLong()::incrementAndGet); allocationStatsAtomicReference.set( - new DesiredBalanceReconciler(createBuiltInClusterSettings(), threadPool).reconcile(desiredBalance, routingAllocation) + new DesiredBalanceReconciler(createBuiltInClusterSettings(), new AdvancingTimeProvider()).reconcile( + desiredBalance, + routingAllocation + ) ); } @@ -1583,4 +1580,36 @@ private static Settings throttleSettings() { .put(CLUSTER_ROUTING_ALLOCATION_NODE_CONCURRENT_OUTGOING_RECOVERIES_SETTING.getKey(), 1000) .build(); } + + /** + * A time-provider that advances each time it's asked the time + */ + private static class AdvancingTimeProvider implements TimeProvider { + + private final AtomicLong currentTimeMillis = new AtomicLong(); + + public void advanceByMillis(long milliseconds) { + currentTimeMillis.addAndGet(milliseconds); + } + + @Override + public long relativeTimeInMillis() { + return currentTimeMillis.incrementAndGet(); + } + + @Override + public long relativeTimeInNanos() { + return NANOSECONDS.toNanos(relativeTimeInMillis()); + } + + @Override + public long rawRelativeTimeInMillis() { + return relativeTimeInMillis(); + } + + @Override + public long absoluteTimeInMillis() { + throw new UnsupportedOperationException("not supported"); + } + } } From 9bd1e24b2a72e3c2166be9691df237e1ddfed5b2 Mon Sep 17 00:00:00 2001 From: Nick Tindall Date: Wed, 29 Oct 2025 14:38:57 +1100 Subject: [PATCH 21/44] Serialise became_undesired_time --- .../cluster/routing/ShardRouting.java | 16 +++++++++++++++- .../shard_routing_includes_became_undesired.csv | 1 + .../resources/transport/upper_bounds/9.3.csv | 2 +- .../cluster/routing/ShardRoutingTests.java | 7 ++++++- 4 files changed, 23 insertions(+), 3 deletions(-) create mode 100644 server/src/main/resources/transport/definitions/referable/shard_routing_includes_became_undesired.csv diff --git a/server/src/main/java/org/elasticsearch/cluster/routing/ShardRouting.java b/server/src/main/java/org/elasticsearch/cluster/routing/ShardRouting.java index 0d892183826f4..28f1b38b90c92 100644 --- a/server/src/main/java/org/elasticsearch/cluster/routing/ShardRouting.java +++ b/server/src/main/java/org/elasticsearch/cluster/routing/ShardRouting.java @@ -45,6 +45,9 @@ public final class ShardRouting implements Writeable, ToXContentObject { public static final long NOT_UNDESIRED_TIMESTAMP = Long.MAX_VALUE; private static final TransportVersion EXPECTED_SHARD_SIZE_FOR_STARTED_VERSION = TransportVersions.V_8_5_0; private static final TransportVersion RELOCATION_FAILURE_INFO_VERSION = TransportVersions.V_8_6_0; + private static final TransportVersion BECAME_UNDESIRED_TIME_VERSION = TransportVersion.fromName( + "shard_routing_includes_became_undesired" + ); private final ShardId shardId; private final String currentNodeId; @@ -368,7 +371,11 @@ public ShardRouting(ShardId shardId, StreamInput in) throws IOException { role = Role.DEFAULT; } targetRelocatingShard = initializeTargetRelocatingShard(); - becameUndesiredTime = NOT_UNDESIRED_TIMESTAMP; // TODO: serialize + if (in.getTransportVersion().supports(BECAME_UNDESIRED_TIME_VERSION)) { + becameUndesiredTime = in.readLong(); + } else { + becameUndesiredTime = NOT_UNDESIRED_TIMESTAMP; + } } public ShardRouting(StreamInput in) throws IOException { @@ -407,6 +414,10 @@ public void writeToThin(StreamOutput out) throws IOException { Strings.format("cannot send role [%s] to node with version [%s]", role, out.getTransportVersion().toReleaseVersion()) ); } + + if (out.getTransportVersion().supports(BECAME_UNDESIRED_TIME_VERSION)) { + out.writeLong(becameUndesiredTime); + } } @Override @@ -945,6 +956,9 @@ public String shortSummary() { if (expectedShardSize != UNAVAILABLE_EXPECTED_SHARD_SIZE) { sb.append(", expected_shard_size[").append(expectedShardSize).append("]"); } + if (becameUndesiredTime != NOT_UNDESIRED_TIMESTAMP) { + sb.append(", became_undesired_time[").append(becameUndesiredTime).append("]"); + } return sb.toString(); } diff --git a/server/src/main/resources/transport/definitions/referable/shard_routing_includes_became_undesired.csv b/server/src/main/resources/transport/definitions/referable/shard_routing_includes_became_undesired.csv new file mode 100644 index 0000000000000..a79975c00dea2 --- /dev/null +++ b/server/src/main/resources/transport/definitions/referable/shard_routing_includes_became_undesired.csv @@ -0,0 +1 @@ +9204000 diff --git a/server/src/main/resources/transport/upper_bounds/9.3.csv b/server/src/main/resources/transport/upper_bounds/9.3.csv index 47728be741b74..f91c1311b5025 100644 --- a/server/src/main/resources/transport/upper_bounds/9.3.csv +++ b/server/src/main/resources/transport/upper_bounds/9.3.csv @@ -1 +1 @@ -esql_resolve_fields_response_removed_min_tv,9203000 +shard_routing_includes_became_undesired,9204000 diff --git a/server/src/test/java/org/elasticsearch/cluster/routing/ShardRoutingTests.java b/server/src/test/java/org/elasticsearch/cluster/routing/ShardRoutingTests.java index 61eb50730366f..e6bf938e52787 100644 --- a/server/src/test/java/org/elasticsearch/cluster/routing/ShardRoutingTests.java +++ b/server/src/test/java/org/elasticsearch/cluster/routing/ShardRoutingTests.java @@ -376,7 +376,10 @@ public void testEqualsIgnoringVersion() { otherRouting.currentNodeId(), otherRouting.primary() == false, otherRouting.state() - ).withRelocatingNodeId(otherRouting.relocatingNodeId()).withUnassignedInfo(otherRouting.unassignedInfo()).build(); + ).withRelocatingNodeId(otherRouting.relocatingNodeId()) + .withUnassignedInfo(otherRouting.unassignedInfo()) + .withBecameUndesired(otherRouting.becameUndesiredTime()) + .build(); break; case 6: // change state @@ -396,6 +399,7 @@ public void testEqualsIgnoringVersion() { ) : null ) + .withBecameUndesired(otherRouting.becameUndesiredTime()) .build(); break; } @@ -414,6 +418,7 @@ public void testEqualsIgnoringVersion() { ? new UnassignedInfo(UnassignedInfo.Reason.INDEX_CREATED, "test") : new UnassignedInfo(UnassignedInfo.Reason.INDEX_CREATED, otherRouting.unassignedInfo().message() + "_1") ) + .withBecameUndesired(otherRouting.becameUndesiredTime()) .build(); } From a8fba5265b6a1469cd4bbd42df0701858c6b16fc Mon Sep 17 00:00:00 2001 From: Nick Tindall Date: Wed, 29 Oct 2025 16:12:58 +1100 Subject: [PATCH 22/44] Add unit test --- .../allocator/DesiredBalanceReconciler.java | 2 +- .../DesiredBalanceReconcilerTests.java | 170 ++++++++++++++++++ 2 files changed, 171 insertions(+), 1 deletion(-) diff --git a/server/src/main/java/org/elasticsearch/cluster/routing/allocation/allocator/DesiredBalanceReconciler.java b/server/src/main/java/org/elasticsearch/cluster/routing/allocation/allocator/DesiredBalanceReconciler.java index de15beec6eb06..7faf469bad3b0 100644 --- a/server/src/main/java/org/elasticsearch/cluster/routing/allocation/allocator/DesiredBalanceReconciler.java +++ b/server/src/main/java/org/elasticsearch/cluster/routing/allocation/allocator/DesiredBalanceReconciler.java @@ -680,7 +680,7 @@ private void logUndesiredShardDetails(ShardRouting shardRouting, TimeValue undes logger.warn("Shard {} has been in an undesired allocation for {}", shardRouting.shardId(), undesiredDuration); for (final var nodeId : assignment.nodeIds()) { final var decision = allocation.deciders().canAllocate(shardRouting, routingNodes.node(nodeId), allocation); - logger.warn("Shard [{}] cannot be allocated on node [{}]: {}", shardRouting.shardId(), nodeId, decision); + logger.warn("Shard {} cannot be allocated on node [{}]: {}", shardRouting.shardId(), nodeId, decision); } } finally { allocation.setDebugMode(originalDebugMode); diff --git a/server/src/test/java/org/elasticsearch/cluster/routing/allocation/allocator/DesiredBalanceReconcilerTests.java b/server/src/test/java/org/elasticsearch/cluster/routing/allocation/allocator/DesiredBalanceReconcilerTests.java index dcd18001d11a9..cbd253c9f763b 100644 --- a/server/src/test/java/org/elasticsearch/cluster/routing/allocation/allocator/DesiredBalanceReconcilerTests.java +++ b/server/src/test/java/org/elasticsearch/cluster/routing/allocation/allocator/DesiredBalanceReconcilerTests.java @@ -1408,6 +1408,176 @@ public void testShouldLogOnTooManyUndesiredAllocations() { ); } + public void testShouldLogOnPersistentUndesiredAllocations() { + + final int shardCount = randomIntBetween(5, 8); + + final var allShardsDesiredOnDataNode1 = Maps.newMapWithExpectedSize(shardCount); + final var allShardsDesiredOnDataNode2 = Maps.newMapWithExpectedSize(shardCount); + + final var metadataBuilder = Metadata.builder(); + final var routingTableBuilder = RoutingTable.builder(); + for (int i = 0; i < shardCount; i++) { + final var indexMetadata = IndexMetadata.builder("index-" + i).settings(indexSettings(IndexVersion.current(), 1, 0)).build(); + final var index = indexMetadata.getIndex(); + final var shardId = new ShardId(index, 0); + metadataBuilder.put(indexMetadata, false); + routingTableBuilder.add(IndexRoutingTable.builder(index).addShard(newShardRouting(shardId, "data-node-1", true, STARTED))); + + allShardsDesiredOnDataNode1.put(shardId, new ShardAssignment(Set.of("data-node-1"), 1, 0, 0)); + allShardsDesiredOnDataNode2.put(shardId, new ShardAssignment(Set.of("data-node-2"), 1, 0, 0)); + } + + final var shardToPreventMovement = "index-" + randomIntBetween(0, shardCount - 1); + + // Prevent allocation of a specific shard node 2 + final var preventAllocationOnNode2Decider = new AllocationDecider() { + @Override + public Decision canAllocate(ShardRouting shardRouting, RoutingNode node, RoutingAllocation allocation) { + return node.nodeId().equals("data-node-2") && shardToPreventMovement.equals(shardRouting.index().getName()) + ? Decision.single(Decision.Type.NO, "no_decider", "Blocks allocation on node 2") + : Decision.YES; + } + }; + // Just to illustrate that yes decisions are excluded from the summary + final var yesDecider = new AllocationDecider() { + @Override + public Decision canAllocate(ShardRouting shardRouting, RoutingNode node, RoutingAllocation allocation) { + return Decision.single(Decision.Type.YES, "yes_decider", "This should not be included in the summary"); + } + }; + + final var initialClusterState = ClusterState.builder(ClusterName.DEFAULT) + .nodes(DiscoveryNodes.builder().add(newNode("data-node-1")).add(newNode("data-node-2"))) + .metadata(metadataBuilder) + .routingTable(routingTableBuilder) + .build(); + + final var undesiredAllocationThreshold = TimeValue.timeValueMinutes(randomIntBetween(10, 50)); + final var clusterSettings = createBuiltInClusterSettings( + Settings.builder() + .put(DesiredBalanceReconciler.UNDESIRED_ALLOCATION_DURATION_LOG_THRESHOLD_SETTING.getKey(), undesiredAllocationThreshold) + .build() + ); + final var timeProvider = new AdvancingTimeProvider(); + final var reconciler = new DesiredBalanceReconciler(clusterSettings, timeProvider); + + final var currentStateHolder = new AtomicReference(); + + final var shardInUndesiredAllocationMessage = "Shard [" + shardToPreventMovement + "][0] has been in an undesired allocation for *"; + + // Desired assignment matches current routing table, should not log + assertThatLogger( + () -> currentStateHolder.set( + reconcileAndBuildNewState( + reconciler, + initialClusterState, + new DesiredBalance(1, allShardsDesiredOnDataNode1), + preventAllocationOnNode2Decider + ) + ), + DesiredBalanceReconciler.class, + new MockLog.UnseenEventExpectation( + "Should not log if all shards on desired location", + DesiredBalanceReconciler.class.getCanonicalName(), + Level.WARN, + shardInUndesiredAllocationMessage + ) + ); + + // Shards are first identified as being in undesired allocations + assertThatLogger( + () -> currentStateHolder.set( + reconcileAndBuildNewState( + reconciler, + currentStateHolder.get(), + new DesiredBalance(1, allShardsDesiredOnDataNode2), + preventAllocationOnNode2Decider, + yesDecider + ) + ), + DesiredBalanceReconciler.class, + new MockLog.UnseenEventExpectation( + "Should not log because we haven't passed the threshold yet", + DesiredBalanceReconciler.class.getCanonicalName(), + Level.WARN, + shardInUndesiredAllocationMessage + ) + ); + + // Advance past the logging threshold + timeProvider.advanceByMillis(randomLongBetween(undesiredAllocationThreshold.millis(), undesiredAllocationThreshold.millis() * 2)); + + // Now it should log + assertThatLogger( + () -> currentStateHolder.set( + reconcileAndBuildNewState( + reconciler, + currentStateHolder.get(), + new DesiredBalance(1, allShardsDesiredOnDataNode2), + preventAllocationOnNode2Decider, + yesDecider + ) + ), + DesiredBalanceReconciler.class, + new MockLog.SeenEventExpectation( + "Should log because this is the first reconciliation after the threshold is exceeded", + DesiredBalanceReconciler.class.getCanonicalName(), + Level.WARN, + shardInUndesiredAllocationMessage + ), + new MockLog.SeenEventExpectation( + "Should log the NO decisions", + DesiredBalanceReconciler.class.getCanonicalName(), + Level.WARN, + "[" + shardToPreventMovement + "][0] cannot be allocated on node [data-node-2]: [NO(Blocks allocation on node 2)]" + ) + ); + + // The rate limiter should prevent it logging again + assertThatLogger( + () -> currentStateHolder.set( + reconcileAndBuildNewState( + reconciler, + currentStateHolder.get(), + new DesiredBalance(1, allShardsDesiredOnDataNode2), + preventAllocationOnNode2Decider, + yesDecider + ) + ), + DesiredBalanceReconciler.class, + new MockLog.UnseenEventExpectation( + "Should not log because the rate limiter should prevent it", + DesiredBalanceReconciler.class.getCanonicalName(), + Level.WARN, + shardInUndesiredAllocationMessage + ) + ); + } + + /** + * Run reconciler, complete any shard movements, then return the resulting cluster state + */ + private ClusterState reconcileAndBuildNewState( + DesiredBalanceReconciler desiredBalanceReconciler, + ClusterState clusterState, + DesiredBalance balance, + AllocationDecider... allocationDeciders + ) { + final RoutingAllocation routingAllocation = createRoutingAllocationFrom(clusterState, allocationDeciders); + desiredBalanceReconciler.reconcile(balance, routingAllocation); + // start all initializing shards + routingAllocation.routingNodes().forEach(routingNode -> routingNode.forEach(shardRouting -> { + if (shardRouting.initializing()) { + routingAllocation.routingNodes().startShard(shardRouting, routingAllocation.changes(), 0L); + } + })); + return ClusterState.builder(clusterState) + .routingTable(clusterState.globalRoutingTable().rebuild(routingAllocation.routingNodes(), routingAllocation.metadata())) + .incrementVersion() + .build(); + } + private static void reconcile(RoutingAllocation routingAllocation, DesiredBalance desiredBalance) { reconcile(routingAllocation, desiredBalance, ALLOCATION_STATS_PLACEHOLDER); } From fc9c7d0b1c3970c032b518e99195df80f501615d Mon Sep 17 00:00:00 2001 From: Nick Tindall Date: Wed, 5 Nov 2025 16:37:53 +1100 Subject: [PATCH 23/44] Track undesired allocations locally --- .../cluster/routing/RoutingNodes.java | 28 --- .../cluster/routing/ShardRouting.java | 112 ++------- .../allocator/DesiredBalanceReconciler.java | 225 ++++++++---------- .../UndesiredAllocationsTracker.java | 182 ++++++++++++++ .../common/settings/ClusterSettings.java | 4 +- ...hard_routing_includes_became_undesired.csv | 1 - .../resources/transport/upper_bounds/9.3.csv | 2 +- .../routing/IndexRoutingTableTests.java | 3 +- .../cluster/routing/ShardRoutingTests.java | 37 +-- .../DesiredBalanceReconcilerTests.java | 20 +- .../cluster/routing/ShardRoutingHelper.java | 6 +- .../cluster/routing/TestShardRouting.java | 18 +- 12 files changed, 324 insertions(+), 314 deletions(-) create mode 100644 server/src/main/java/org/elasticsearch/cluster/routing/allocation/allocator/UndesiredAllocationsTracker.java delete mode 100644 server/src/main/resources/transport/definitions/referable/shard_routing_includes_became_undesired.csv diff --git a/server/src/main/java/org/elasticsearch/cluster/routing/RoutingNodes.java b/server/src/main/java/org/elasticsearch/cluster/routing/RoutingNodes.java index fc6bc7c3652fd..12e10fc5ae04e 100644 --- a/server/src/main/java/org/elasticsearch/cluster/routing/RoutingNodes.java +++ b/server/src/main/java/org/elasticsearch/cluster/routing/RoutingNodes.java @@ -892,34 +892,6 @@ public Map> getAssignedShards() { return Collections.unmodifiableMap(assignedShards); } - /** - * Record that this shard is in an undesired location if it's not already marked as such - * - * @return The updated {@link ShardRouting} or this if no change was made - */ - public ShardRouting markAsUndesired(ShardRouting originalShard, long becameUndesiredTime) { - assert originalShard.started() : "Only started shards can be marked as being in an undesired allocation"; - final var withUndesiredInfo = originalShard.markAsUndesired(becameUndesiredTime); - if (withUndesiredInfo != originalShard) { - updateAssigned(originalShard, withUndesiredInfo); - return withUndesiredInfo; - } - return originalShard; - } - - /** - * Clear any undesired allocation metadata from the specified shard - * - * @param originalShard The {@link ShardRouting} to clear undesired metadata from - */ - public void clearUndesired(ShardRouting originalShard) { - assert originalShard.started() : "Only started shards can be marked as being in a desired allocation"; - final var withoutUndesiredInfo = originalShard.clearUndesired(); - if (withoutUndesiredInfo != originalShard) { - updateAssigned(originalShard, withoutUndesiredInfo); - } - } - @Override public boolean equals(Object o) { if (this == o) { diff --git a/server/src/main/java/org/elasticsearch/cluster/routing/ShardRouting.java b/server/src/main/java/org/elasticsearch/cluster/routing/ShardRouting.java index 28f1b38b90c92..2ce349e2d3b61 100644 --- a/server/src/main/java/org/elasticsearch/cluster/routing/ShardRouting.java +++ b/server/src/main/java/org/elasticsearch/cluster/routing/ShardRouting.java @@ -42,12 +42,8 @@ public final class ShardRouting implements Writeable, ToXContentObject { * Used if shard size is not available */ public static final long UNAVAILABLE_EXPECTED_SHARD_SIZE = -1; - public static final long NOT_UNDESIRED_TIMESTAMP = Long.MAX_VALUE; private static final TransportVersion EXPECTED_SHARD_SIZE_FOR_STARTED_VERSION = TransportVersions.V_8_5_0; private static final TransportVersion RELOCATION_FAILURE_INFO_VERSION = TransportVersions.V_8_6_0; - private static final TransportVersion BECAME_UNDESIRED_TIME_VERSION = TransportVersion.fromName( - "shard_routing_includes_became_undesired" - ); private final ShardId shardId; private final String currentNodeId; @@ -71,7 +67,6 @@ public final class ShardRouting implements Writeable, ToXContentObject { @Nullable private final ShardRouting targetRelocatingShard; private final Role role; - private final long becameUndesiredTime; /** * A constructor to internally create shard routing instances, note, the internal flag should only be set to true @@ -88,8 +83,7 @@ public final class ShardRouting implements Writeable, ToXContentObject { RelocationFailureInfo relocationFailureInfo, AllocationId allocationId, long expectedShardSize, - Role role, - long becameUndesiredTime + Role role ) { this.shardId = shardId; this.currentNodeId = currentNodeId; @@ -103,7 +97,6 @@ public final class ShardRouting implements Writeable, ToXContentObject { this.expectedShardSize = expectedShardSize; this.role = role; this.targetRelocatingShard = initializeTargetRelocatingShard(); - this.becameUndesiredTime = becameUndesiredTime; assert assertConsistent(); } @@ -157,8 +150,7 @@ private ShardRouting initializeTargetRelocatingShard() { RelocationFailureInfo.NO_FAILURES, AllocationId.newTargetRelocation(allocationId), expectedShardSize, - role, - NOT_UNDESIRED_TIMESTAMP // Assume we didn't relocate to another undesired allocation (?) + role ); } else { return null; @@ -186,8 +178,7 @@ public static ShardRouting newUnassigned( RelocationFailureInfo.NO_FAILURES, null, UNAVAILABLE_EXPECTED_SHARD_SIZE, - role, - NOT_UNDESIRED_TIMESTAMP + role ); } @@ -371,11 +362,6 @@ public ShardRouting(ShardId shardId, StreamInput in) throws IOException { role = Role.DEFAULT; } targetRelocatingShard = initializeTargetRelocatingShard(); - if (in.getTransportVersion().supports(BECAME_UNDESIRED_TIME_VERSION)) { - becameUndesiredTime = in.readLong(); - } else { - becameUndesiredTime = NOT_UNDESIRED_TIMESTAMP; - } } public ShardRouting(StreamInput in) throws IOException { @@ -414,10 +400,6 @@ public void writeToThin(StreamOutput out) throws IOException { Strings.format("cannot send role [%s] to node with version [%s]", role, out.getTransportVersion().toReleaseVersion()) ); } - - if (out.getTransportVersion().supports(BECAME_UNDESIRED_TIME_VERSION)) { - out.writeLong(becameUndesiredTime); - } } @Override @@ -440,59 +422,10 @@ public ShardRouting updateUnassigned(UnassignedInfo unassignedInfo, RecoverySour relocationFailureInfo, allocationId, expectedShardSize, - role, - becameUndesiredTime - ); - } - - public ShardRouting markAsUndesired(long becameUndesiredTime) { - if (this.becameUndesiredTime != NOT_UNDESIRED_TIMESTAMP) { - return this; - } - return new ShardRouting( - shardId, - currentNodeId, - relocatingNodeId, - primary, - state, - recoverySource, - unassignedInfo, - relocationFailureInfo, - allocationId, - expectedShardSize, - role, - becameUndesiredTime + role ); } - public ShardRouting clearUndesired() { - if (becameUndesiredTime == NOT_UNDESIRED_TIMESTAMP) { - return this; - } - return new ShardRouting( - shardId, - currentNodeId, - relocatingNodeId, - primary, - state, - recoverySource, - unassignedInfo, - relocationFailureInfo, - allocationId, - expectedShardSize, - role, - NOT_UNDESIRED_TIMESTAMP - ); - } - - public boolean isInUndesiredAllocation() { - return becameUndesiredTime != NOT_UNDESIRED_TIMESTAMP; - } - - public long becameUndesiredTime() { - return becameUndesiredTime; - } - public ShardRouting updateRelocationFailure(RelocationFailureInfo relocationFailureInfo) { assert this.relocationFailureInfo != null : "can only update relocation failure info info if it is already set"; return new ShardRouting( @@ -506,8 +439,7 @@ public ShardRouting updateRelocationFailure(RelocationFailureInfo relocationFail relocationFailureInfo, allocationId, expectedShardSize, - role, - becameUndesiredTime + role ); } @@ -537,8 +469,7 @@ public ShardRouting moveToUnassigned(UnassignedInfo unassignedInfo) { RelocationFailureInfo.NO_FAILURES, null, UNAVAILABLE_EXPECTED_SHARD_SIZE, - role, - NOT_UNDESIRED_TIMESTAMP // Unassigned and undesired are tracked separately + role ); } @@ -567,8 +498,7 @@ public ShardRouting initialize(String nodeId, @Nullable String existingAllocatio RelocationFailureInfo.NO_FAILURES, allocationId, expectedShardSize, - role, - becameUndesiredTime + role ); } @@ -590,8 +520,7 @@ public ShardRouting relocate(String relocatingNodeId, long expectedShardSize) { relocationFailureInfo, AllocationId.newRelocation(allocationId), expectedShardSize, - role, - becameUndesiredTime + role ); } @@ -614,8 +543,7 @@ public ShardRouting cancelRelocation() { relocationFailureInfo.incFailedRelocations(), AllocationId.cancelRelocation(allocationId), UNAVAILABLE_EXPECTED_SHARD_SIZE, - role, - becameUndesiredTime + role ); } @@ -640,8 +568,7 @@ public ShardRouting removeRelocationSource() { relocationFailureInfo, AllocationId.finishRelocation(allocationId), expectedShardSize, - role, - becameUndesiredTime + role ); } @@ -663,8 +590,7 @@ public ShardRouting reinitializeReplicaShard() { relocationFailureInfo, AllocationId.newInitializing(), expectedShardSize, - role, - becameUndesiredTime + role ); } @@ -692,8 +618,7 @@ public ShardRouting moveToStarted(long expectedShardSize) { RelocationFailureInfo.NO_FAILURES, allocationId, expectedShardSize, - role, - becameUndesiredTime + role ); } @@ -718,8 +643,7 @@ public ShardRouting moveActiveReplicaToPrimary() { relocationFailureInfo, allocationId, expectedShardSize, - role, - becameUndesiredTime + role ); } @@ -744,8 +668,7 @@ public ShardRouting moveUnassignedFromPrimary() { relocationFailureInfo, allocationId, expectedShardSize, - role, - becameUndesiredTime + role ); } @@ -889,8 +812,7 @@ public boolean equals(Object o) { ShardRouting that = (ShardRouting) o; return equalsIgnoringMetadata(that) && Objects.equals(unassignedInfo, that.unassignedInfo) - && Objects.equals(relocationFailureInfo, that.relocationFailureInfo) - && becameUndesiredTime == that.becameUndesiredTime; + && Objects.equals(relocationFailureInfo, that.relocationFailureInfo); } /** @@ -912,7 +834,6 @@ public int hashCode() { h = 31 * h + (allocationId != null ? allocationId.hashCode() : 0); h = 31 * h + (unassignedInfo != null ? unassignedInfo.hashCode() : 0); h = 31 * h + (relocationFailureInfo != null ? relocationFailureInfo.hashCode() : 0); - h = 31 * h + Long.hashCode(becameUndesiredTime); h = 31 * h + role.hashCode(); hashCode = h; } @@ -956,9 +877,6 @@ public String shortSummary() { if (expectedShardSize != UNAVAILABLE_EXPECTED_SHARD_SIZE) { sb.append(", expected_shard_size[").append(expectedShardSize).append("]"); } - if (becameUndesiredTime != NOT_UNDESIRED_TIMESTAMP) { - sb.append(", became_undesired_time[").append(becameUndesiredTime).append("]"); - } return sb.toString(); } diff --git a/server/src/main/java/org/elasticsearch/cluster/routing/allocation/allocator/DesiredBalanceReconciler.java b/server/src/main/java/org/elasticsearch/cluster/routing/allocation/allocator/DesiredBalanceReconciler.java index 7faf469bad3b0..3a2ca272e151e 100644 --- a/server/src/main/java/org/elasticsearch/cluster/routing/allocation/allocator/DesiredBalanceReconciler.java +++ b/server/src/main/java/org/elasticsearch/cluster/routing/allocation/allocator/DesiredBalanceReconciler.java @@ -79,43 +79,26 @@ public class DesiredBalanceReconciler { Setting.Property.NodeScope ); - /** - * Warning logs will be periodically written if we see a shard that's been in an undesired allocation for this long - */ - public static final Setting UNDESIRED_ALLOCATION_DURATION_LOG_THRESHOLD_SETTING = Setting.timeSetting( - "cluster.routing.allocation.desired_balance.undesired_duration_logging.threshold", - TimeValue.timeValueMinutes(5), - Setting.Property.Dynamic, - Setting.Property.NodeScope - ); - - private final TimeProvider timeProvider; private final FrequencyCappedAction undesiredAllocationPercentageLogInterval; - private final FrequencyCappedAction undesiredAllocationDurationLogInterval; private double undesiredAllocationsPercentageLoggingThreshold; private final NodeAllocationOrdering allocationOrdering = new NodeAllocationOrdering(); private final NodeAllocationOrdering moveOrdering = new NodeAllocationOrdering(); - private volatile TimeValue undesiredAllocationDurationLoggingThreshold; + private final UndesiredAllocationsTracker undesiredAllocationsTracker; public DesiredBalanceReconciler(ClusterSettings clusterSettings, TimeProvider timeProvider) { - this.timeProvider = timeProvider; this.undesiredAllocationPercentageLogInterval = new FrequencyCappedAction( timeProvider::relativeTimeInMillis, TimeValue.timeValueMinutes(5) ); - this.undesiredAllocationDurationLogInterval = new FrequencyCappedAction(timeProvider::relativeTimeInMillis, TimeValue.ZERO); - clusterSettings.initializeAndWatch(UNDESIRED_ALLOCATIONS_LOG_INTERVAL_SETTING, value -> { - this.undesiredAllocationPercentageLogInterval.setMinInterval(value); - this.undesiredAllocationDurationLogInterval.setMinInterval(value); - }); clusterSettings.initializeAndWatch( - UNDESIRED_ALLOCATIONS_LOG_THRESHOLD_SETTING, - value -> this.undesiredAllocationsPercentageLoggingThreshold = value + UNDESIRED_ALLOCATIONS_LOG_INTERVAL_SETTING, + this.undesiredAllocationPercentageLogInterval::setMinInterval ); clusterSettings.initializeAndWatch( - UNDESIRED_ALLOCATION_DURATION_LOG_THRESHOLD_SETTING, - value -> this.undesiredAllocationDurationLoggingThreshold = value + UNDESIRED_ALLOCATIONS_LOG_THRESHOLD_SETTING, + value -> this.undesiredAllocationsPercentageLoggingThreshold = value ); + this.undesiredAllocationsTracker = new UndesiredAllocationsTracker(clusterSettings, timeProvider, 50); } /** @@ -136,6 +119,7 @@ public DesiredBalanceMetrics.AllocationStats reconcile(DesiredBalance desiredBal public void clear() { allocationOrdering.clear(); moveOrdering.clear(); + undesiredAllocationsTracker.clear(); } /** @@ -156,6 +140,7 @@ private class Reconciliation { } DesiredBalanceMetrics.AllocationStats run() { + undesiredAllocationsTracker.cleanup(routingNodes); try (var ignored = allocation.withReconcilingFlag()) { logger.debug("Reconciling desired balance for [{}]", desiredBalance.lastConvergedIndex()); @@ -515,37 +500,51 @@ private void moveShards() { if (assignment.nodeIds().contains(shardRouting.currentNodeId())) { // shard is already on a desired node - routingNodes.clearUndesired(shardRouting); + undesiredAllocationsTracker.clear(shardRouting); continue; } - shardRouting = routingNodes.markAsUndesired(shardRouting, timeProvider.relativeTimeInMillis()); - if (allocation.deciders().canAllocate(shardRouting, allocation).type() != Decision.Type.YES) { - // cannot allocate anywhere, no point in looking for a target node - continue; - } + boolean movedUndesiredShard = false; + try { + if (allocation.deciders().canAllocate(shardRouting, allocation).type() != Decision.Type.YES) { + // cannot allocate anywhere, no point in looking for a target node + continue; + } - final var routingNode = routingNodes.node(shardRouting.currentNodeId()); - final var canRemainDecision = allocation.deciders().canRemain(shardRouting, routingNode, allocation); - if (canRemainDecision.type() != Decision.Type.NO && canRemainDecision.type() != Decision.Type.NOT_PREFERRED) { - // If movement is throttled, a future reconciliation round will see a resolution. For now, leave it alone. - // Reconciliation treats canRemain NOT_PREFERRED answers as YES because the DesiredBalance computation already decided - // how to handle the situation. - continue; - } + final var routingNode = routingNodes.node(shardRouting.currentNodeId()); + final var canRemainDecision = allocation.deciders().canRemain(shardRouting, routingNode, allocation); + if (canRemainDecision.type() != Decision.Type.NO && canRemainDecision.type() != Decision.Type.NOT_PREFERRED) { + // If movement is throttled, a future reconciliation round will see a resolution. For now, leave it alone. + // Reconciliation treats canRemain NOT_PREFERRED answers as YES because the DesiredBalance computation already + // decided how to handle the situation. + continue; + } - final var moveTarget = findRelocationTarget(shardRouting, assignment.nodeIds()); - if (moveTarget != null) { - logger.debug("Moving shard {} from {} to {}", shardRouting.shardId(), shardRouting.currentNodeId(), moveTarget.getId()); - routingNodes.relocateShard( - shardRouting, - moveTarget.getId(), - allocation.clusterInfo().getShardSize(shardRouting, ShardRouting.UNAVAILABLE_EXPECTED_SHARD_SIZE), - "move", - allocation.changes() - ); - iterator.dePrioritizeNode(shardRouting.currentNodeId()); - moveOrdering.recordAllocation(shardRouting.currentNodeId()); + final var moveTarget = findRelocationTarget(shardRouting, assignment.nodeIds()); + if (moveTarget != null) { + logger.debug( + "Moving shard {} from {} to {}", + shardRouting.shardId(), + shardRouting.currentNodeId(), + moveTarget.getId() + ); + routingNodes.relocateShard( + shardRouting, + moveTarget.getId(), + allocation.clusterInfo().getShardSize(shardRouting, ShardRouting.UNAVAILABLE_EXPECTED_SHARD_SIZE), + "move", + allocation.changes() + ); + iterator.dePrioritizeNode(shardRouting.currentNodeId()); + moveOrdering.recordAllocation(shardRouting.currentNodeId()); + movedUndesiredShard = true; + } + } finally { + if (movedUndesiredShard) { + undesiredAllocationsTracker.clear(shardRouting); + } else { + undesiredAllocationsTracker.trackUndesiredAllocation(shardRouting); + } } } } @@ -580,56 +579,63 @@ private DesiredBalanceMetrics.AllocationStats balance() { if (assignment.nodeIds().contains(shardRouting.currentNodeId())) { // shard is already on a desired node - routingNodes.clearUndesired(shardRouting); + undesiredAllocationsTracker.clear(shardRouting); continue; } - shardRouting = routingNodes.markAsUndesired(shardRouting, timeProvider.relativeTimeInMillis()); - assert shardRouting.becameUndesiredTime() != ShardRouting.NOT_UNDESIRED_TIMESTAMP; - earliestUndesiredTimestamp = Math.min(earliestUndesiredTimestamp, shardRouting.becameUndesiredTime()); - - if (allocation.metadata().nodeShutdowns().contains(shardRouting.currentNodeId()) == false) { - // shard is not on a shutting down node, nor is it on a desired node per the previous check. - undesiredAllocationsExcludingShuttingDownNodes++; - undesiredAllocationsExcludingShuttingDownNodesByRole.addTo(shardRouting.role(), 1); - } - if (allocation.deciders().canRebalance(allocation).type() != Decision.Type.YES) { - // Rebalancing is disabled, we're just here to collect the AllocationStats to return. - continue; - } + boolean movedUndesiredShard = false; + try { + if (allocation.metadata().nodeShutdowns().contains(shardRouting.currentNodeId()) == false) { + // shard is not on a shutting down node, nor is it on a desired node per the previous check. + undesiredAllocationsExcludingShuttingDownNodes++; + undesiredAllocationsExcludingShuttingDownNodesByRole.addTo(shardRouting.role(), 1); + } - if (allocation.deciders().canRebalance(shardRouting, allocation).type() != Decision.Type.YES) { - // rebalancing disabled for this shard - continue; - } + if (allocation.deciders().canRebalance(allocation).type() != Decision.Type.YES) { + // Rebalancing is disabled, we're just here to collect the AllocationStats to return. + continue; + } - if (allocation.deciders().canAllocate(shardRouting, allocation).type() != Decision.Type.YES) { - // cannot allocate anywhere, no point in looking for a target node - continue; - } + if (allocation.deciders().canRebalance(shardRouting, allocation).type() != Decision.Type.YES) { + // rebalancing disabled for this shard + continue; + } - final var rebalanceTarget = findRelocationTarget(shardRouting, assignment.nodeIds(), this::decideCanAllocate); - if (rebalanceTarget != null) { - logger.debug( - "Rebalancing shard {} from {} to {}", - shardRouting.shardId(), - shardRouting.currentNodeId(), - rebalanceTarget.getId() - ); + if (allocation.deciders().canAllocate(shardRouting, allocation).type() != Decision.Type.YES) { + // cannot allocate anywhere, no point in looking for a target node + continue; + } - routingNodes.relocateShard( - shardRouting, - rebalanceTarget.getId(), - allocation.clusterInfo().getShardSize(shardRouting, ShardRouting.UNAVAILABLE_EXPECTED_SHARD_SIZE), - "rebalance", - allocation.changes() - ); - iterator.dePrioritizeNode(shardRouting.currentNodeId()); - moveOrdering.recordAllocation(shardRouting.currentNodeId()); + final var rebalanceTarget = findRelocationTarget(shardRouting, assignment.nodeIds(), this::decideCanAllocate); + if (rebalanceTarget != null) { + logger.debug( + "Rebalancing shard {} from {} to {}", + shardRouting.shardId(), + shardRouting.currentNodeId(), + rebalanceTarget.getId() + ); + + routingNodes.relocateShard( + shardRouting, + rebalanceTarget.getId(), + allocation.clusterInfo().getShardSize(shardRouting, ShardRouting.UNAVAILABLE_EXPECTED_SHARD_SIZE), + "rebalance", + allocation.changes() + ); + iterator.dePrioritizeNode(shardRouting.currentNodeId()); + moveOrdering.recordAllocation(shardRouting.currentNodeId()); + movedUndesiredShard = true; + } + } finally { + if (movedUndesiredShard) { + undesiredAllocationsTracker.clear(shardRouting); + } else { + undesiredAllocationsTracker.trackUndesiredAllocation(shardRouting); + } } } - maybeLogUndesiredShardsWarning(earliestUndesiredTimestamp); + undesiredAllocationsTracker.maybeLogUndesiredShardsWarning(routingNodes, allocation, desiredBalance); maybeLogUndesiredAllocationsWarning(totalAllocations, undesiredAllocationsExcludingShuttingDownNodes, routingNodes.size()); return new DesiredBalanceMetrics.AllocationStats( unassignedShards, @@ -646,47 +652,6 @@ private DesiredBalanceMetrics.AllocationStats balance() { ); } - /** - * If there are shards that have been in undesired allocations for longer than the configured - * threshold, log a warning - */ - private void maybeLogUndesiredShardsWarning(long earliestUndesiredTimestamp) { - final long currentTimeMillis = timeProvider.relativeTimeInMillis(); - if (earliestUndesiredTimestamp < currentTimeMillis - && currentTimeMillis - earliestUndesiredTimestamp > undesiredAllocationDurationLoggingThreshold.millis()) { - undesiredAllocationDurationLogInterval.maybeExecute(this::logDecisionsForUndesiredShardsOverThreshold); - } - } - - private void logDecisionsForUndesiredShardsOverThreshold() { - final long currentTimeMillis = timeProvider.relativeTimeInMillis(); - routingNodes.forEach(routingNode -> { - routingNode.forEach(shardRouting -> { - if (shardRouting.isInUndesiredAllocation()) { - final long undesiredDurationMs = currentTimeMillis - shardRouting.becameUndesiredTime(); - if (undesiredDurationMs > undesiredAllocationDurationLoggingThreshold.millis()) { - logUndesiredShardDetails(shardRouting, TimeValue.timeValueMillis(undesiredDurationMs)); - } - } - }); - }); - } - - private void logUndesiredShardDetails(ShardRouting shardRouting, TimeValue undesiredDuration) { - final RoutingAllocation.DebugMode originalDebugMode = allocation.getDebugMode(); - allocation.setDebugMode(RoutingAllocation.DebugMode.EXCLUDE_YES_DECISIONS); - try { - final var assignment = desiredBalance.getAssignment(shardRouting.shardId()); - logger.warn("Shard {} has been in an undesired allocation for {}", shardRouting.shardId(), undesiredDuration); - for (final var nodeId : assignment.nodeIds()) { - final var decision = allocation.deciders().canAllocate(shardRouting, routingNodes.node(nodeId), allocation); - logger.warn("Shard {} cannot be allocated on node [{}]: {}", shardRouting.shardId(), nodeId, decision); - } - } finally { - allocation.setDebugMode(originalDebugMode); - } - } - private void maybeLogUndesiredAllocationsWarning(int totalAllocations, int undesiredAllocations, int nodeCount) { // more shards than cluster can relocate with one reroute final boolean nonEmptyRelocationBacklog = undesiredAllocations > 2L * nodeCount; diff --git a/server/src/main/java/org/elasticsearch/cluster/routing/allocation/allocator/UndesiredAllocationsTracker.java b/server/src/main/java/org/elasticsearch/cluster/routing/allocation/allocator/UndesiredAllocationsTracker.java new file mode 100644 index 0000000000000..fd420b8ebbf24 --- /dev/null +++ b/server/src/main/java/org/elasticsearch/cluster/routing/allocation/allocator/UndesiredAllocationsTracker.java @@ -0,0 +1,182 @@ +/* + * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one + * or more contributor license agreements. Licensed under the "Elastic License + * 2.0", the "GNU Affero General Public License v3.0 only", and the "Server Side + * Public License v 1"; you may not use this file except in compliance with, at + * your election, the "Elastic License 2.0", the "GNU Affero General Public + * License v3.0 only", or the "Server Side Public License, v 1". + */ + +package org.elasticsearch.cluster.routing.allocation.allocator; + +import com.carrotsearch.hppc.ObjectLongHashMap; + +import org.elasticsearch.cluster.routing.RoutingNodes; +import org.elasticsearch.cluster.routing.ShardRouting; +import org.elasticsearch.cluster.routing.allocation.RoutingAllocation; +import org.elasticsearch.common.FrequencyCappedAction; +import org.elasticsearch.common.settings.ClusterSettings; +import org.elasticsearch.common.settings.Setting; +import org.elasticsearch.common.time.TimeProvider; +import org.elasticsearch.core.TimeValue; +import org.elasticsearch.logging.LogManager; +import org.elasticsearch.logging.Logger; + +/** + * Keeps track of a limited number of shards that are currently in undesired allocations. If the + * shards remain in undesired allocations for longer than a configurable threshold, it will log + * why those shards can't be allocated to desired nodes. + */ +public class UndesiredAllocationsTracker { + + private static final Logger logger = LogManager.getLogger(UndesiredAllocationsTracker.class); + + /** + * Warning logs will be periodically written if we see a shard that's been in an undesired allocation for this long + */ + public static final Setting UNDESIRED_ALLOCATION_DURATION_LOG_THRESHOLD_SETTING = Setting.timeSetting( + "cluster.routing.allocation.desired_balance.undesired_duration_logging.threshold", + TimeValue.timeValueMinutes(5), + Setting.Property.Dynamic, + Setting.Property.NodeScope + ); + + /** + * The max number of undesired allocations to track. We expect this to be relatively small. + */ + public static final Setting MAX_UNDESIRED_ALLOCATIONS_TO_TRACK = Setting.intSetting( + "cluster.routing.allocation.desired_balance.undesired_duration_logging.max_to_track", + 50, + 0, + 200, + Setting.Property.Dynamic, + Setting.Property.NodeScope + ); + + private final TimeProvider timeProvider; + private final ObjectLongHashMap undesiredAllocations = new ObjectLongHashMap<>(); + private final FrequencyCappedAction undesiredAllocationDurationLogInterval; + private volatile TimeValue undesiredAllocationDurationLoggingThreshold; + private volatile int maxUndesiredAllocationsToTrack; + + public UndesiredAllocationsTracker(ClusterSettings clusterSettings, TimeProvider timeProvider, int maxUndesiredAllocationsToTrack) { + this.timeProvider = timeProvider; + this.maxUndesiredAllocationsToTrack = maxUndesiredAllocationsToTrack; + this.undesiredAllocationDurationLogInterval = new FrequencyCappedAction(timeProvider::relativeTimeInMillis, TimeValue.ZERO); + clusterSettings.initializeAndWatch( + DesiredBalanceReconciler.UNDESIRED_ALLOCATIONS_LOG_INTERVAL_SETTING, + undesiredAllocationDurationLogInterval::setMinInterval + ); + clusterSettings.initializeAndWatch( + UNDESIRED_ALLOCATION_DURATION_LOG_THRESHOLD_SETTING, + value -> undesiredAllocationDurationLoggingThreshold = value + ); + clusterSettings.initializeAndWatch(MAX_UNDESIRED_ALLOCATIONS_TO_TRACK, value -> this.maxUndesiredAllocationsToTrack = value); + } + + /** + * Track an allocation as being undesired + */ + public void trackUndesiredAllocation(ShardRouting shardRouting) { + if (undesiredAllocations.containsKey(shardRouting) == false) { + assert shardRouting.unassigned() == false : "Shouldn't record unassigned shards as undesired allocations"; + if (undesiredAllocations.size() < maxUndesiredAllocationsToTrack) { + undesiredAllocations.put(shardRouting, timeProvider.relativeTimeInMillis()); + } + } + } + + /** + * Clear any tracking of the provided allocation + */ + public void clear(ShardRouting shardRouting) { + undesiredAllocations.remove(shardRouting); + } + + /** + * Clear any {@link ShardRouting} that are no longer present in the routing nodes + */ + public void cleanup(RoutingNodes routingNodes) { + undesiredAllocations.removeAll(shardRouting -> { + final var allocationId = shardRouting.allocationId(); + if (allocationId != null) { + return routingNodes.getByAllocationId(shardRouting.shardId(), allocationId.getId()) == null; + } else { + assert false : "Unassigned shards shouldn't be marked as undesired"; + return true; + } + }); + } + + /** + * Clear all tracked allocations + */ + public void clear() { + undesiredAllocations.clear(); + } + + /** + * If there are shards that have been in undesired allocations for longer than the configured + * threshold, log a warning + */ + public void maybeLogUndesiredShardsWarning( + RoutingNodes routingNodes, + RoutingAllocation routingAllocation, + DesiredBalance desiredBalance + ) { + final long currentTimeMillis = timeProvider.relativeTimeInMillis(); + long earliestUndesiredTimestamp = Long.MAX_VALUE; + for (var allocation : undesiredAllocations) { + if (allocation.value < earliestUndesiredTimestamp) { + earliestUndesiredTimestamp = allocation.value; + } + } + if (earliestUndesiredTimestamp < currentTimeMillis + && currentTimeMillis - earliestUndesiredTimestamp > undesiredAllocationDurationLoggingThreshold.millis()) { + undesiredAllocationDurationLogInterval.maybeExecute( + () -> logDecisionsForUndesiredShardsOverThreshold(routingNodes, routingAllocation, desiredBalance) + ); + } + } + + private void logDecisionsForUndesiredShardsOverThreshold( + RoutingNodes routingNodes, + RoutingAllocation routingAllocation, + DesiredBalance desiredBalance + ) { + final long currentTimeMillis = timeProvider.relativeTimeInMillis(); + final long loggingThresholdTimestamp = currentTimeMillis - undesiredAllocationDurationLoggingThreshold.millis(); + for (var allocation : undesiredAllocations) { + if (allocation.value < loggingThresholdTimestamp) { + logUndesiredShardDetails( + allocation.key, + TimeValue.timeValueMillis(currentTimeMillis - allocation.value), + routingNodes, + routingAllocation, + desiredBalance + ); + } + } + } + + private void logUndesiredShardDetails( + ShardRouting shardRouting, + TimeValue undesiredDuration, + RoutingNodes routingNodes, + RoutingAllocation allocation, + DesiredBalance desiredBalance + ) { + final RoutingAllocation.DebugMode originalDebugMode = allocation.getDebugMode(); + allocation.setDebugMode(RoutingAllocation.DebugMode.EXCLUDE_YES_DECISIONS); + try { + final var assignment = desiredBalance.getAssignment(shardRouting.shardId()); + logger.warn("Shard {} has been in an undesired allocation for {}", shardRouting.shardId(), undesiredDuration); + for (final var nodeId : assignment.nodeIds()) { + final var decision = allocation.deciders().canAllocate(shardRouting, routingNodes.node(nodeId), allocation); + logger.warn("Shard {} cannot be allocated on node [{}]: {}", shardRouting.shardId(), nodeId, decision); + } + } finally { + allocation.setDebugMode(originalDebugMode); + } + } +} diff --git a/server/src/main/java/org/elasticsearch/common/settings/ClusterSettings.java b/server/src/main/java/org/elasticsearch/common/settings/ClusterSettings.java index 25880cbbfa453..c90eae104f146 100644 --- a/server/src/main/java/org/elasticsearch/common/settings/ClusterSettings.java +++ b/server/src/main/java/org/elasticsearch/common/settings/ClusterSettings.java @@ -51,6 +51,7 @@ import org.elasticsearch.cluster.routing.allocation.allocator.BalancedShardsAllocator; import org.elasticsearch.cluster.routing.allocation.allocator.DesiredBalanceComputer; import org.elasticsearch.cluster.routing.allocation.allocator.DesiredBalanceReconciler; +import org.elasticsearch.cluster.routing.allocation.allocator.UndesiredAllocationsTracker; import org.elasticsearch.cluster.routing.allocation.decider.AwarenessAllocationDecider; import org.elasticsearch.cluster.routing.allocation.decider.ClusterRebalanceAllocationDecider; import org.elasticsearch.cluster.routing.allocation.decider.ConcurrentRebalanceAllocationDecider; @@ -235,7 +236,8 @@ public void apply(Settings value, Settings current, Settings previous) { DesiredBalanceComputer.MAX_BALANCE_COMPUTATION_TIME_DURING_INDEX_CREATION_SETTING, DesiredBalanceReconciler.UNDESIRED_ALLOCATIONS_LOG_INTERVAL_SETTING, DesiredBalanceReconciler.UNDESIRED_ALLOCATIONS_LOG_THRESHOLD_SETTING, - DesiredBalanceReconciler.UNDESIRED_ALLOCATION_DURATION_LOG_THRESHOLD_SETTING, + UndesiredAllocationsTracker.UNDESIRED_ALLOCATION_DURATION_LOG_THRESHOLD_SETTING, + UndesiredAllocationsTracker.MAX_UNDESIRED_ALLOCATIONS_TO_TRACK, BreakerSettings.CIRCUIT_BREAKER_LIMIT_SETTING, BreakerSettings.CIRCUIT_BREAKER_OVERHEAD_SETTING, BreakerSettings.CIRCUIT_BREAKER_TYPE, diff --git a/server/src/main/resources/transport/definitions/referable/shard_routing_includes_became_undesired.csv b/server/src/main/resources/transport/definitions/referable/shard_routing_includes_became_undesired.csv deleted file mode 100644 index a79975c00dea2..0000000000000 --- a/server/src/main/resources/transport/definitions/referable/shard_routing_includes_became_undesired.csv +++ /dev/null @@ -1 +0,0 @@ -9204000 diff --git a/server/src/main/resources/transport/upper_bounds/9.3.csv b/server/src/main/resources/transport/upper_bounds/9.3.csv index f91c1311b5025..47728be741b74 100644 --- a/server/src/main/resources/transport/upper_bounds/9.3.csv +++ b/server/src/main/resources/transport/upper_bounds/9.3.csv @@ -1 +1 @@ -shard_routing_includes_became_undesired,9204000 +esql_resolve_fields_response_removed_min_tv,9203000 diff --git a/server/src/test/java/org/elasticsearch/cluster/routing/IndexRoutingTableTests.java b/server/src/test/java/org/elasticsearch/cluster/routing/IndexRoutingTableTests.java index dcf5ae461be99..37d23cc855ec4 100644 --- a/server/src/test/java/org/elasticsearch/cluster/routing/IndexRoutingTableTests.java +++ b/server/src/test/java/org/elasticsearch/cluster/routing/IndexRoutingTableTests.java @@ -146,8 +146,7 @@ private ShardRouting getShard(ShardId shardId, boolean isPrimary, ShardRoutingSt TestShardRouting.buildRelocationFailureInfo(state), TestShardRouting.buildAllocationId(state), randomLongBetween(-1, 1024), - role, - randomBoolean() ? ShardRouting.UNAVAILABLE_EXPECTED_SHARD_SIZE : randomLong() + role ); } diff --git a/server/src/test/java/org/elasticsearch/cluster/routing/ShardRoutingTests.java b/server/src/test/java/org/elasticsearch/cluster/routing/ShardRoutingTests.java index e6bf938e52787..23fdb47619ef8 100644 --- a/server/src/test/java/org/elasticsearch/cluster/routing/ShardRoutingTests.java +++ b/server/src/test/java/org/elasticsearch/cluster/routing/ShardRoutingTests.java @@ -51,8 +51,7 @@ protected ShardRouting createTestInstance() { TestShardRouting.buildRelocationFailureInfo(state), TestShardRouting.buildAllocationId(state), randomLongBetween(-1, 1024), - randomFrom(ShardRouting.Role.DEFAULT, (primary ? ShardRouting.Role.INDEX_ONLY : ShardRouting.Role.SEARCH_ONLY)), - randomBoolean() ? ShardRouting.NOT_UNDESIRED_TIMESTAMP : randomLong() + randomFrom(ShardRouting.Role.DEFAULT, (primary ? ShardRouting.Role.INDEX_ONLY : ShardRouting.Role.SEARCH_ONLY)) ); } @@ -84,8 +83,7 @@ private static ShardRouting mutateShardId(ShardRouting instance) { instance.relocationFailureInfo(), instance.allocationId(), instance.getExpectedShardSize(), - instance.role(), - instance.becameUndesiredTime() + instance.role() ); } @@ -117,8 +115,7 @@ private static ShardRouting mutateState(ShardRouting instance) { ); }, instance.getExpectedShardSize(), - instance.role(), - instance.becameUndesiredTime() + instance.role() ); } @@ -134,8 +131,7 @@ private static ShardRouting mutateCurrentNodeId(ShardRouting instance) { instance.relocationFailureInfo(), instance.allocationId(), instance.getExpectedShardSize(), - instance.role(), - instance.becameUndesiredTime() + instance.role() ); } @@ -157,8 +153,7 @@ private static ShardRouting mutateRole(ShardRouting instance) { ShardRouting.Role.DEFAULT, (instance.primary() ? ShardRouting.Role.INDEX_ONLY : ShardRouting.Role.SEARCH_ONLY) ) - ), - instance.becameUndesiredTime() + ) ); } @@ -279,8 +274,7 @@ public void testEqualsIgnoringVersion() { otherRouting.relocationFailureInfo(), otherRouting.allocationId(), otherRouting.getExpectedShardSize(), - otherRouting.role(), - otherRouting.becameUndesiredTime() + otherRouting.role() ); break; case 1: @@ -296,8 +290,7 @@ public void testEqualsIgnoringVersion() { otherRouting.relocationFailureInfo(), otherRouting.allocationId(), otherRouting.getExpectedShardSize(), - otherRouting.role(), - otherRouting.becameUndesiredTime() + otherRouting.role() ); break; case 2: @@ -316,8 +309,7 @@ public void testEqualsIgnoringVersion() { otherRouting.relocationFailureInfo(), otherRouting.allocationId(), otherRouting.getExpectedShardSize(), - otherRouting.role(), - otherRouting.becameUndesiredTime() + otherRouting.role() ); } break; @@ -337,8 +329,7 @@ public void testEqualsIgnoringVersion() { otherRouting.relocationFailureInfo(), otherRouting.allocationId(), otherRouting.getExpectedShardSize(), - otherRouting.role(), - otherRouting.becameUndesiredTime() + otherRouting.role() ); } break; @@ -363,8 +354,7 @@ public void testEqualsIgnoringVersion() { otherRouting.relocationFailureInfo(), otherRouting.allocationId(), otherRouting.getExpectedShardSize(), - otherRouting.role(), - otherRouting.becameUndesiredTime() + otherRouting.role() ); } break; @@ -376,10 +366,7 @@ public void testEqualsIgnoringVersion() { otherRouting.currentNodeId(), otherRouting.primary() == false, otherRouting.state() - ).withRelocatingNodeId(otherRouting.relocatingNodeId()) - .withUnassignedInfo(otherRouting.unassignedInfo()) - .withBecameUndesired(otherRouting.becameUndesiredTime()) - .build(); + ).withRelocatingNodeId(otherRouting.relocatingNodeId()).withUnassignedInfo(otherRouting.unassignedInfo()).build(); break; case 6: // change state @@ -399,7 +386,6 @@ public void testEqualsIgnoringVersion() { ) : null ) - .withBecameUndesired(otherRouting.becameUndesiredTime()) .build(); break; } @@ -418,7 +404,6 @@ public void testEqualsIgnoringVersion() { ? new UnassignedInfo(UnassignedInfo.Reason.INDEX_CREATED, "test") : new UnassignedInfo(UnassignedInfo.Reason.INDEX_CREATED, otherRouting.unassignedInfo().message() + "_1") ) - .withBecameUndesired(otherRouting.becameUndesiredTime()) .build(); } diff --git a/server/src/test/java/org/elasticsearch/cluster/routing/allocation/allocator/DesiredBalanceReconcilerTests.java b/server/src/test/java/org/elasticsearch/cluster/routing/allocation/allocator/DesiredBalanceReconcilerTests.java index cbd253c9f763b..14923fcc54f6e 100644 --- a/server/src/test/java/org/elasticsearch/cluster/routing/allocation/allocator/DesiredBalanceReconcilerTests.java +++ b/server/src/test/java/org/elasticsearch/cluster/routing/allocation/allocator/DesiredBalanceReconcilerTests.java @@ -1456,7 +1456,7 @@ public Decision canAllocate(ShardRouting shardRouting, RoutingNode node, Routing final var undesiredAllocationThreshold = TimeValue.timeValueMinutes(randomIntBetween(10, 50)); final var clusterSettings = createBuiltInClusterSettings( Settings.builder() - .put(DesiredBalanceReconciler.UNDESIRED_ALLOCATION_DURATION_LOG_THRESHOLD_SETTING.getKey(), undesiredAllocationThreshold) + .put(UndesiredAllocationsTracker.UNDESIRED_ALLOCATION_DURATION_LOG_THRESHOLD_SETTING.getKey(), undesiredAllocationThreshold) .build() ); final var timeProvider = new AdvancingTimeProvider(); @@ -1476,10 +1476,10 @@ public Decision canAllocate(ShardRouting shardRouting, RoutingNode node, Routing preventAllocationOnNode2Decider ) ), - DesiredBalanceReconciler.class, + UndesiredAllocationsTracker.class, new MockLog.UnseenEventExpectation( "Should not log if all shards on desired location", - DesiredBalanceReconciler.class.getCanonicalName(), + UndesiredAllocationsTracker.class.getCanonicalName(), Level.WARN, shardInUndesiredAllocationMessage ) @@ -1496,10 +1496,10 @@ public Decision canAllocate(ShardRouting shardRouting, RoutingNode node, Routing yesDecider ) ), - DesiredBalanceReconciler.class, + UndesiredAllocationsTracker.class, new MockLog.UnseenEventExpectation( "Should not log because we haven't passed the threshold yet", - DesiredBalanceReconciler.class.getCanonicalName(), + UndesiredAllocationsTracker.class.getCanonicalName(), Level.WARN, shardInUndesiredAllocationMessage ) @@ -1519,16 +1519,16 @@ public Decision canAllocate(ShardRouting shardRouting, RoutingNode node, Routing yesDecider ) ), - DesiredBalanceReconciler.class, + UndesiredAllocationsTracker.class, new MockLog.SeenEventExpectation( "Should log because this is the first reconciliation after the threshold is exceeded", - DesiredBalanceReconciler.class.getCanonicalName(), + UndesiredAllocationsTracker.class.getCanonicalName(), Level.WARN, shardInUndesiredAllocationMessage ), new MockLog.SeenEventExpectation( "Should log the NO decisions", - DesiredBalanceReconciler.class.getCanonicalName(), + UndesiredAllocationsTracker.class.getCanonicalName(), Level.WARN, "[" + shardToPreventMovement + "][0] cannot be allocated on node [data-node-2]: [NO(Blocks allocation on node 2)]" ) @@ -1545,10 +1545,10 @@ public Decision canAllocate(ShardRouting shardRouting, RoutingNode node, Routing yesDecider ) ), - DesiredBalanceReconciler.class, + UndesiredAllocationsTracker.class, new MockLog.UnseenEventExpectation( "Should not log because the rate limiter should prevent it", - DesiredBalanceReconciler.class.getCanonicalName(), + UndesiredAllocationsTracker.class.getCanonicalName(), Level.WARN, shardInUndesiredAllocationMessage ) diff --git a/test/framework/src/main/java/org/elasticsearch/cluster/routing/ShardRoutingHelper.java b/test/framework/src/main/java/org/elasticsearch/cluster/routing/ShardRoutingHelper.java index 59161f5c688fd..a536112e93a1f 100644 --- a/test/framework/src/main/java/org/elasticsearch/cluster/routing/ShardRoutingHelper.java +++ b/test/framework/src/main/java/org/elasticsearch/cluster/routing/ShardRoutingHelper.java @@ -52,8 +52,7 @@ public static ShardRouting initWithSameId(ShardRouting copy, RecoverySource reco RelocationFailureInfo.NO_FAILURES, copy.allocationId(), copy.getExpectedShardSize(), - copy.role(), - copy.becameUndesiredTime() + copy.role() ); } @@ -73,8 +72,7 @@ public static ShardRouting newWithRestoreSource(ShardRouting routing, SnapshotRe routing.relocationFailureInfo(), routing.allocationId(), routing.getExpectedShardSize(), - routing.role(), - routing.becameUndesiredTime() + routing.role() ); } } diff --git a/test/framework/src/main/java/org/elasticsearch/cluster/routing/TestShardRouting.java b/test/framework/src/main/java/org/elasticsearch/cluster/routing/TestShardRouting.java index 2dee946f6fa43..84e1dd532e2b7 100644 --- a/test/framework/src/main/java/org/elasticsearch/cluster/routing/TestShardRouting.java +++ b/test/framework/src/main/java/org/elasticsearch/cluster/routing/TestShardRouting.java @@ -63,7 +63,6 @@ public static class Builder { private AllocationId allocationId; private Long expectedShardSize; private ShardRouting.Role role; - private Long becameUndesired; public Builder(ShardId shardId, String currentNodeId, boolean primary, ShardRoutingState state) { this.shardId = shardId; @@ -107,11 +106,6 @@ public Builder withRole(ShardRouting.Role role) { return this; } - public Builder withBecameUndesired(long becameUndesired) { - this.becameUndesired = becameUndesired; - return this; - } - public ShardRouting build() { return new ShardRouting( shardId, @@ -124,8 +118,7 @@ public ShardRouting build() { relocationFailureInfo != null ? relocationFailureInfo : buildRelocationFailureInfo(state), allocationId != null ? allocationId : buildAllocationId(state), expectedShardSize != null ? expectedShardSize : ShardRouting.UNAVAILABLE_EXPECTED_SHARD_SIZE, - role != null ? role : ShardRouting.Role.DEFAULT, - becameUndesired != null ? becameUndesired : ShardRouting.NOT_UNDESIRED_TIMESTAMP + role != null ? role : ShardRouting.Role.DEFAULT ); } } @@ -167,8 +160,7 @@ public static ShardRouting newShardRouting( buildRelocationFailureInfo(state), buildAllocationId(state), ShardRouting.UNAVAILABLE_EXPECTED_SHARD_SIZE, - role, - ShardRouting.NOT_UNDESIRED_TIMESTAMP + role ); } @@ -192,8 +184,7 @@ public static ShardRouting newShardRouting( buildRelocationFailureInfo(state), buildAllocationId(state), ShardRouting.UNAVAILABLE_EXPECTED_SHARD_SIZE, - role, - ShardRouting.NOT_UNDESIRED_TIMESTAMP + role ); } @@ -232,8 +223,7 @@ public static ShardRouting newShardRouting( buildRelocationFailureInfo(state), buildAllocationId(state), ShardRouting.UNAVAILABLE_EXPECTED_SHARD_SIZE, - ShardRouting.Role.DEFAULT, - ShardRouting.NOT_UNDESIRED_TIMESTAMP + ShardRouting.Role.DEFAULT ); } From 5f57c0b7d0886905726fa763dbb2dccc9a1721c4 Mon Sep 17 00:00:00 2001 From: Nick Tindall Date: Wed, 5 Nov 2025 16:50:30 +1100 Subject: [PATCH 24/44] Minimise change --- .../allocator/DesiredBalanceReconciler.java | 24 +++++++------------ 1 file changed, 8 insertions(+), 16 deletions(-) diff --git a/server/src/main/java/org/elasticsearch/cluster/routing/allocation/allocator/DesiredBalanceReconciler.java b/server/src/main/java/org/elasticsearch/cluster/routing/allocation/allocator/DesiredBalanceReconciler.java index 3a2ca272e151e..0c2a1994d5690 100644 --- a/server/src/main/java/org/elasticsearch/cluster/routing/allocation/allocator/DesiredBalanceReconciler.java +++ b/server/src/main/java/org/elasticsearch/cluster/routing/allocation/allocator/DesiredBalanceReconciler.java @@ -79,24 +79,18 @@ public class DesiredBalanceReconciler { Setting.Property.NodeScope ); - private final FrequencyCappedAction undesiredAllocationPercentageLogInterval; - private double undesiredAllocationsPercentageLoggingThreshold; + private final FrequencyCappedAction undesiredAllocationLogInterval; + private double undesiredAllocationsLogThreshold; private final NodeAllocationOrdering allocationOrdering = new NodeAllocationOrdering(); private final NodeAllocationOrdering moveOrdering = new NodeAllocationOrdering(); private final UndesiredAllocationsTracker undesiredAllocationsTracker; public DesiredBalanceReconciler(ClusterSettings clusterSettings, TimeProvider timeProvider) { - this.undesiredAllocationPercentageLogInterval = new FrequencyCappedAction( - timeProvider::relativeTimeInMillis, - TimeValue.timeValueMinutes(5) - ); - clusterSettings.initializeAndWatch( - UNDESIRED_ALLOCATIONS_LOG_INTERVAL_SETTING, - this.undesiredAllocationPercentageLogInterval::setMinInterval - ); + this.undesiredAllocationLogInterval = new FrequencyCappedAction(timeProvider::relativeTimeInMillis, TimeValue.timeValueMinutes(5)); + clusterSettings.initializeAndWatch(UNDESIRED_ALLOCATIONS_LOG_INTERVAL_SETTING, this.undesiredAllocationLogInterval::setMinInterval); clusterSettings.initializeAndWatch( UNDESIRED_ALLOCATIONS_LOG_THRESHOLD_SETTING, - value -> this.undesiredAllocationsPercentageLoggingThreshold = value + value -> this.undesiredAllocationsLogThreshold = value ); this.undesiredAllocationsTracker = new UndesiredAllocationsTracker(clusterSettings, timeProvider, 50); } @@ -559,7 +553,6 @@ private DesiredBalanceMetrics.AllocationStats balance() { // Iterate over all started shards and try to move any which are on undesired nodes. In the presence of throttling shard // movements, the goal of this iteration order is to achieve a fairer movement of shards from the nodes that are offloading the // shards. - var earliestUndesiredTimestamp = Long.MAX_VALUE; for (final var iterator = OrderedShardsIterator.createForBalancing(allocation, moveOrdering); iterator.hasNext();) { var shardRouting = iterator.next(); @@ -655,16 +648,15 @@ private DesiredBalanceMetrics.AllocationStats balance() { private void maybeLogUndesiredAllocationsWarning(int totalAllocations, int undesiredAllocations, int nodeCount) { // more shards than cluster can relocate with one reroute final boolean nonEmptyRelocationBacklog = undesiredAllocations > 2L * nodeCount; - final boolean warningThresholdReached = undesiredAllocations > undesiredAllocationsPercentageLoggingThreshold - * totalAllocations; + final boolean warningThresholdReached = undesiredAllocations > undesiredAllocationsLogThreshold * totalAllocations; if (totalAllocations > 0 && nonEmptyRelocationBacklog && warningThresholdReached) { - undesiredAllocationPercentageLogInterval.maybeExecute( + undesiredAllocationLogInterval.maybeExecute( () -> logger.warn( "[{}] of assigned shards ({}/{}) are not on their desired nodes, which exceeds the warn threshold of [{}]", Strings.format1Decimals(100.0 * undesiredAllocations / totalAllocations, "%"), undesiredAllocations, totalAllocations, - Strings.format1Decimals(100.0 * undesiredAllocationsPercentageLoggingThreshold, "%") + Strings.format1Decimals(100.0 * undesiredAllocationsLogThreshold, "%") ) ); } From 5d1d4c0302aa7bc2ba2956fc2de5fe06d9d75ace Mon Sep 17 00:00:00 2001 From: Nick Tindall Date: Wed, 5 Nov 2025 16:56:39 +1100 Subject: [PATCH 25/44] Minimise change --- .../allocation/allocator/DesiredBalanceReconciler.java | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/server/src/main/java/org/elasticsearch/cluster/routing/allocation/allocator/DesiredBalanceReconciler.java b/server/src/main/java/org/elasticsearch/cluster/routing/allocation/allocator/DesiredBalanceReconciler.java index 0c2a1994d5690..f1d67432773e0 100644 --- a/server/src/main/java/org/elasticsearch/cluster/routing/allocation/allocator/DesiredBalanceReconciler.java +++ b/server/src/main/java/org/elasticsearch/cluster/routing/allocation/allocator/DesiredBalanceReconciler.java @@ -479,7 +479,7 @@ private void moveShards() { // Iterate over all started shards and check if they can remain. In the presence of throttling shard movements, // the goal of this iteration order is to achieve a fairer movement of shards from the nodes that are offloading the shards. for (final var iterator = OrderedShardsIterator.createForNecessaryMoves(allocation, moveOrdering); iterator.hasNext();) { - var shardRouting = iterator.next(); + final var shardRouting = iterator.next(); if (shardRouting.started() == false) { // can only move started shards @@ -554,7 +554,7 @@ private DesiredBalanceMetrics.AllocationStats balance() { // movements, the goal of this iteration order is to achieve a fairer movement of shards from the nodes that are offloading the // shards. for (final var iterator = OrderedShardsIterator.createForBalancing(allocation, moveOrdering); iterator.hasNext();) { - var shardRouting = iterator.next(); + final var shardRouting = iterator.next(); totalAllocations++; totalAllocationsByRole.addTo(shardRouting.role(), 1); From 54533f7bc6b81ca6bdb051e8c8f708e97bd5bd15 Mon Sep 17 00:00:00 2001 From: Nick Tindall Date: Wed, 5 Nov 2025 17:00:38 +1100 Subject: [PATCH 26/44] Tidy --- .../routing/allocation/allocator/DesiredBalanceReconciler.java | 2 +- .../allocation/allocator/UndesiredAllocationsTracker.java | 3 +-- 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/server/src/main/java/org/elasticsearch/cluster/routing/allocation/allocator/DesiredBalanceReconciler.java b/server/src/main/java/org/elasticsearch/cluster/routing/allocation/allocator/DesiredBalanceReconciler.java index f1d67432773e0..a7337afd26fc8 100644 --- a/server/src/main/java/org/elasticsearch/cluster/routing/allocation/allocator/DesiredBalanceReconciler.java +++ b/server/src/main/java/org/elasticsearch/cluster/routing/allocation/allocator/DesiredBalanceReconciler.java @@ -92,7 +92,7 @@ public DesiredBalanceReconciler(ClusterSettings clusterSettings, TimeProvider ti UNDESIRED_ALLOCATIONS_LOG_THRESHOLD_SETTING, value -> this.undesiredAllocationsLogThreshold = value ); - this.undesiredAllocationsTracker = new UndesiredAllocationsTracker(clusterSettings, timeProvider, 50); + this.undesiredAllocationsTracker = new UndesiredAllocationsTracker(clusterSettings, timeProvider); } /** diff --git a/server/src/main/java/org/elasticsearch/cluster/routing/allocation/allocator/UndesiredAllocationsTracker.java b/server/src/main/java/org/elasticsearch/cluster/routing/allocation/allocator/UndesiredAllocationsTracker.java index fd420b8ebbf24..6a28b2ea6810e 100644 --- a/server/src/main/java/org/elasticsearch/cluster/routing/allocation/allocator/UndesiredAllocationsTracker.java +++ b/server/src/main/java/org/elasticsearch/cluster/routing/allocation/allocator/UndesiredAllocationsTracker.java @@ -59,9 +59,8 @@ public class UndesiredAllocationsTracker { private volatile TimeValue undesiredAllocationDurationLoggingThreshold; private volatile int maxUndesiredAllocationsToTrack; - public UndesiredAllocationsTracker(ClusterSettings clusterSettings, TimeProvider timeProvider, int maxUndesiredAllocationsToTrack) { + UndesiredAllocationsTracker(ClusterSettings clusterSettings, TimeProvider timeProvider) { this.timeProvider = timeProvider; - this.maxUndesiredAllocationsToTrack = maxUndesiredAllocationsToTrack; this.undesiredAllocationDurationLogInterval = new FrequencyCappedAction(timeProvider::relativeTimeInMillis, TimeValue.ZERO); clusterSettings.initializeAndWatch( DesiredBalanceReconciler.UNDESIRED_ALLOCATIONS_LOG_INTERVAL_SETTING, From 2d3f10f33e95d6ae53ee8238ef895d981079fbc2 Mon Sep 17 00:00:00 2001 From: Nick Tindall Date: Wed, 5 Nov 2025 17:04:38 +1100 Subject: [PATCH 27/44] Tidy --- .../allocation/allocator/UndesiredAllocationsTracker.java | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/server/src/main/java/org/elasticsearch/cluster/routing/allocation/allocator/UndesiredAllocationsTracker.java b/server/src/main/java/org/elasticsearch/cluster/routing/allocation/allocator/UndesiredAllocationsTracker.java index 6a28b2ea6810e..7a3300d2259ce 100644 --- a/server/src/main/java/org/elasticsearch/cluster/routing/allocation/allocator/UndesiredAllocationsTracker.java +++ b/server/src/main/java/org/elasticsearch/cluster/routing/allocation/allocator/UndesiredAllocationsTracker.java @@ -77,11 +77,9 @@ public class UndesiredAllocationsTracker { * Track an allocation as being undesired */ public void trackUndesiredAllocation(ShardRouting shardRouting) { - if (undesiredAllocations.containsKey(shardRouting) == false) { - assert shardRouting.unassigned() == false : "Shouldn't record unassigned shards as undesired allocations"; - if (undesiredAllocations.size() < maxUndesiredAllocationsToTrack) { - undesiredAllocations.put(shardRouting, timeProvider.relativeTimeInMillis()); - } + assert shardRouting.unassigned() == false : "Shouldn't record unassigned shards as undesired allocations"; + if (undesiredAllocations.containsKey(shardRouting) == false && undesiredAllocations.size() < maxUndesiredAllocationsToTrack) { + undesiredAllocations.put(shardRouting, timeProvider.relativeTimeInMillis()); } } From ba678a93b2980d2461a7d8a59b3895f0d078749f Mon Sep 17 00:00:00 2001 From: Nick Tindall Date: Wed, 5 Nov 2025 17:09:07 +1100 Subject: [PATCH 28/44] Reduce default limit --- .../allocation/allocator/UndesiredAllocationsTracker.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/server/src/main/java/org/elasticsearch/cluster/routing/allocation/allocator/UndesiredAllocationsTracker.java b/server/src/main/java/org/elasticsearch/cluster/routing/allocation/allocator/UndesiredAllocationsTracker.java index 7a3300d2259ce..4532301505a77 100644 --- a/server/src/main/java/org/elasticsearch/cluster/routing/allocation/allocator/UndesiredAllocationsTracker.java +++ b/server/src/main/java/org/elasticsearch/cluster/routing/allocation/allocator/UndesiredAllocationsTracker.java @@ -46,7 +46,7 @@ public class UndesiredAllocationsTracker { */ public static final Setting MAX_UNDESIRED_ALLOCATIONS_TO_TRACK = Setting.intSetting( "cluster.routing.allocation.desired_balance.undesired_duration_logging.max_to_track", - 50, + 10, 0, 200, Setting.Property.Dynamic, From 6330ba91bf37d8c0a6d2d0e41e7d43f1edd9a4e7 Mon Sep 17 00:00:00 2001 From: Nick Tindall Date: Thu, 6 Nov 2025 10:30:20 +1100 Subject: [PATCH 29/44] clear -> removeTracking --- .../allocation/allocator/DesiredBalanceReconciler.java | 8 ++++---- .../allocation/allocator/UndesiredAllocationsTracker.java | 4 ++-- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/server/src/main/java/org/elasticsearch/cluster/routing/allocation/allocator/DesiredBalanceReconciler.java b/server/src/main/java/org/elasticsearch/cluster/routing/allocation/allocator/DesiredBalanceReconciler.java index a7337afd26fc8..a4edc9fe28b74 100644 --- a/server/src/main/java/org/elasticsearch/cluster/routing/allocation/allocator/DesiredBalanceReconciler.java +++ b/server/src/main/java/org/elasticsearch/cluster/routing/allocation/allocator/DesiredBalanceReconciler.java @@ -494,7 +494,7 @@ private void moveShards() { if (assignment.nodeIds().contains(shardRouting.currentNodeId())) { // shard is already on a desired node - undesiredAllocationsTracker.clear(shardRouting); + undesiredAllocationsTracker.removeTracking(shardRouting); continue; } @@ -535,7 +535,7 @@ private void moveShards() { } } finally { if (movedUndesiredShard) { - undesiredAllocationsTracker.clear(shardRouting); + undesiredAllocationsTracker.removeTracking(shardRouting); } else { undesiredAllocationsTracker.trackUndesiredAllocation(shardRouting); } @@ -572,7 +572,7 @@ private DesiredBalanceMetrics.AllocationStats balance() { if (assignment.nodeIds().contains(shardRouting.currentNodeId())) { // shard is already on a desired node - undesiredAllocationsTracker.clear(shardRouting); + undesiredAllocationsTracker.removeTracking(shardRouting); continue; } @@ -621,7 +621,7 @@ private DesiredBalanceMetrics.AllocationStats balance() { } } finally { if (movedUndesiredShard) { - undesiredAllocationsTracker.clear(shardRouting); + undesiredAllocationsTracker.removeTracking(shardRouting); } else { undesiredAllocationsTracker.trackUndesiredAllocation(shardRouting); } diff --git a/server/src/main/java/org/elasticsearch/cluster/routing/allocation/allocator/UndesiredAllocationsTracker.java b/server/src/main/java/org/elasticsearch/cluster/routing/allocation/allocator/UndesiredAllocationsTracker.java index 4532301505a77..e5d51a691f181 100644 --- a/server/src/main/java/org/elasticsearch/cluster/routing/allocation/allocator/UndesiredAllocationsTracker.java +++ b/server/src/main/java/org/elasticsearch/cluster/routing/allocation/allocator/UndesiredAllocationsTracker.java @@ -84,9 +84,9 @@ public void trackUndesiredAllocation(ShardRouting shardRouting) { } /** - * Clear any tracking of the provided allocation + * Remove any tracking of the specified allocation (a no-op if the allocation isn't being tracked) */ - public void clear(ShardRouting shardRouting) { + public void removeTracking(ShardRouting shardRouting) { undesiredAllocations.remove(shardRouting); } From 207dd0d2fa332003e4ff1c9f74b7801a89adc45b Mon Sep 17 00:00:00 2001 From: Nick Tindall Date: Thu, 6 Nov 2025 10:36:36 +1100 Subject: [PATCH 30/44] Update server/src/main/java/org/elasticsearch/cluster/routing/allocation/allocator/UndesiredAllocationsTracker.java Co-authored-by: Yang Wang --- .../allocation/allocator/UndesiredAllocationsTracker.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/server/src/main/java/org/elasticsearch/cluster/routing/allocation/allocator/UndesiredAllocationsTracker.java b/server/src/main/java/org/elasticsearch/cluster/routing/allocation/allocator/UndesiredAllocationsTracker.java index e5d51a691f181..ba3aaeb7716f5 100644 --- a/server/src/main/java/org/elasticsearch/cluster/routing/allocation/allocator/UndesiredAllocationsTracker.java +++ b/server/src/main/java/org/elasticsearch/cluster/routing/allocation/allocator/UndesiredAllocationsTracker.java @@ -170,7 +170,7 @@ private void logUndesiredShardDetails( logger.warn("Shard {} has been in an undesired allocation for {}", shardRouting.shardId(), undesiredDuration); for (final var nodeId : assignment.nodeIds()) { final var decision = allocation.deciders().canAllocate(shardRouting, routingNodes.node(nodeId), allocation); - logger.warn("Shard {} cannot be allocated on node [{}]: {}", shardRouting.shardId(), nodeId, decision); + logger.warn("Shard {} allocation decision for node [{}]: {}", shardRouting.shardId(), nodeId, decision); } } finally { allocation.setDebugMode(originalDebugMode); From 6d5d2fadec462443d3dfecf3bb545e18cb71470c Mon Sep 17 00:00:00 2001 From: Nick Tindall Date: Thu, 6 Nov 2025 10:44:36 +1100 Subject: [PATCH 31/44] Add separate setting for logging interval, reduce default to 5 minutes. --- .../allocator/UndesiredAllocationsTracker.java | 13 ++++++++++++- .../common/settings/ClusterSettings.java | 1 + 2 files changed, 13 insertions(+), 1 deletion(-) diff --git a/server/src/main/java/org/elasticsearch/cluster/routing/allocation/allocator/UndesiredAllocationsTracker.java b/server/src/main/java/org/elasticsearch/cluster/routing/allocation/allocator/UndesiredAllocationsTracker.java index ba3aaeb7716f5..5f457a238183e 100644 --- a/server/src/main/java/org/elasticsearch/cluster/routing/allocation/allocator/UndesiredAllocationsTracker.java +++ b/server/src/main/java/org/elasticsearch/cluster/routing/allocation/allocator/UndesiredAllocationsTracker.java @@ -41,6 +41,17 @@ public class UndesiredAllocationsTracker { Setting.Property.NodeScope ); + /** + * The minimum amount of time between warnings about persistent undesired allocations + */ + public static final Setting UNDESIRED_ALLOCATION_DURATION_LOG_INTERVAL_SETTING = Setting.timeSetting( + "cluster.routing.allocation.desired_balance.undesired_duration_logging.interval", + TimeValue.timeValueMinutes(5), + TimeValue.timeValueMinutes(1), + Setting.Property.Dynamic, + Setting.Property.NodeScope + ); + /** * The max number of undesired allocations to track. We expect this to be relatively small. */ @@ -63,7 +74,7 @@ public class UndesiredAllocationsTracker { this.timeProvider = timeProvider; this.undesiredAllocationDurationLogInterval = new FrequencyCappedAction(timeProvider::relativeTimeInMillis, TimeValue.ZERO); clusterSettings.initializeAndWatch( - DesiredBalanceReconciler.UNDESIRED_ALLOCATIONS_LOG_INTERVAL_SETTING, + UNDESIRED_ALLOCATION_DURATION_LOG_INTERVAL_SETTING, undesiredAllocationDurationLogInterval::setMinInterval ); clusterSettings.initializeAndWatch( diff --git a/server/src/main/java/org/elasticsearch/common/settings/ClusterSettings.java b/server/src/main/java/org/elasticsearch/common/settings/ClusterSettings.java index 855840d526c55..5ad7c2af94c81 100644 --- a/server/src/main/java/org/elasticsearch/common/settings/ClusterSettings.java +++ b/server/src/main/java/org/elasticsearch/common/settings/ClusterSettings.java @@ -238,6 +238,7 @@ public void apply(Settings value, Settings current, Settings previous) { DesiredBalanceReconciler.UNDESIRED_ALLOCATIONS_LOG_INTERVAL_SETTING, DesiredBalanceReconciler.UNDESIRED_ALLOCATIONS_LOG_THRESHOLD_SETTING, UndesiredAllocationsTracker.UNDESIRED_ALLOCATION_DURATION_LOG_THRESHOLD_SETTING, + UndesiredAllocationsTracker.UNDESIRED_ALLOCATION_DURATION_LOG_INTERVAL_SETTING, UndesiredAllocationsTracker.MAX_UNDESIRED_ALLOCATIONS_TO_TRACK, BreakerSettings.CIRCUIT_BREAKER_LIMIT_SETTING, BreakerSettings.CIRCUIT_BREAKER_OVERHEAD_SETTING, From 0af805059ba298e4f11bc99a3805af93d9ee5375 Mon Sep 17 00:00:00 2001 From: Nick Tindall Date: Thu, 6 Nov 2025 10:51:44 +1100 Subject: [PATCH 32/44] Check for capacity before we check for existing record --- .../allocation/allocator/UndesiredAllocationsTracker.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/server/src/main/java/org/elasticsearch/cluster/routing/allocation/allocator/UndesiredAllocationsTracker.java b/server/src/main/java/org/elasticsearch/cluster/routing/allocation/allocator/UndesiredAllocationsTracker.java index 5f457a238183e..9fb8ef1bd25f5 100644 --- a/server/src/main/java/org/elasticsearch/cluster/routing/allocation/allocator/UndesiredAllocationsTracker.java +++ b/server/src/main/java/org/elasticsearch/cluster/routing/allocation/allocator/UndesiredAllocationsTracker.java @@ -89,7 +89,7 @@ public class UndesiredAllocationsTracker { */ public void trackUndesiredAllocation(ShardRouting shardRouting) { assert shardRouting.unassigned() == false : "Shouldn't record unassigned shards as undesired allocations"; - if (undesiredAllocations.containsKey(shardRouting) == false && undesiredAllocations.size() < maxUndesiredAllocationsToTrack) { + if (undesiredAllocations.size() < maxUndesiredAllocationsToTrack && undesiredAllocations.containsKey(shardRouting) == false) { undesiredAllocations.put(shardRouting, timeProvider.relativeTimeInMillis()); } } From 6db1bc39a748ec611f2a5269cd6397937c675822 Mon Sep 17 00:00:00 2001 From: Nick Tindall Date: Thu, 6 Nov 2025 11:03:41 +1100 Subject: [PATCH 33/44] Discard excess tracking if the limit is reduced --- .../UndesiredAllocationsTracker.java | 28 ++++++ .../DesiredBalanceReconcilerTests.java | 36 +------- .../UndesiredAllocationsTrackerTests.java | 87 +++++++++++++++++++ .../common/time/AdvancingTimeProvider.java | 46 ++++++++++ 4 files changed, 162 insertions(+), 35 deletions(-) create mode 100644 server/src/test/java/org/elasticsearch/cluster/routing/allocation/allocator/UndesiredAllocationsTrackerTests.java create mode 100644 test/framework/src/main/java/org/elasticsearch/common/time/AdvancingTimeProvider.java diff --git a/server/src/main/java/org/elasticsearch/cluster/routing/allocation/allocator/UndesiredAllocationsTracker.java b/server/src/main/java/org/elasticsearch/cluster/routing/allocation/allocator/UndesiredAllocationsTracker.java index 9fb8ef1bd25f5..c2d82d37e638f 100644 --- a/server/src/main/java/org/elasticsearch/cluster/routing/allocation/allocator/UndesiredAllocationsTracker.java +++ b/server/src/main/java/org/elasticsearch/cluster/routing/allocation/allocator/UndesiredAllocationsTracker.java @@ -10,6 +10,7 @@ package org.elasticsearch.cluster.routing.allocation.allocator; import com.carrotsearch.hppc.ObjectLongHashMap; +import com.carrotsearch.hppc.ObjectLongMap; import org.elasticsearch.cluster.routing.RoutingNodes; import org.elasticsearch.cluster.routing.ShardRouting; @@ -19,9 +20,13 @@ import org.elasticsearch.common.settings.Setting; import org.elasticsearch.common.time.TimeProvider; import org.elasticsearch.core.TimeValue; +import org.elasticsearch.core.Tuple; import org.elasticsearch.logging.LogManager; import org.elasticsearch.logging.Logger; +import java.util.stream.Collectors; +import java.util.stream.StreamSupport; + /** * Keeps track of a limited number of shards that are currently in undesired allocations. If the * shards remain in undesired allocations for longer than a configurable threshold, it will log @@ -114,6 +119,7 @@ public void cleanup(RoutingNodes routingNodes) { return true; } }); + shrinkIfOversized(); } /** @@ -187,4 +193,26 @@ private void logUndesiredShardDetails( allocation.setDebugMode(originalDebugMode); } } + + /** + * If the maximum to track was reduced, and we are tracking more than the new maximum, purge the most recent entries + * to bring us under the new limit + */ + private void shrinkIfOversized() { + if (undesiredAllocations.size() > maxUndesiredAllocationsToTrack) { + final var newestExcessValues = StreamSupport.stream(undesiredAllocations.spliterator(), false) + // we need to take a copy from the cursor because the cursors are re-used, so don't work with #sorted + .map(cursor -> new Tuple<>(cursor.key, cursor.value)) + .sorted((a, b) -> Long.compare(b.v2(), a.v2())) + .limit(undesiredAllocations.size() - maxUndesiredAllocationsToTrack) + .map(Tuple::v1) + .collect(Collectors.toSet()); + undesiredAllocations.removeAll(newestExcessValues::contains); + } + } + + // visible for testing + ObjectLongMap getUndesiredAllocations() { + return undesiredAllocations.clone(); + } } diff --git a/server/src/test/java/org/elasticsearch/cluster/routing/allocation/allocator/DesiredBalanceReconcilerTests.java b/server/src/test/java/org/elasticsearch/cluster/routing/allocation/allocator/DesiredBalanceReconcilerTests.java index 60d2959e7f0dd..d9ba8e7de0569 100644 --- a/server/src/test/java/org/elasticsearch/cluster/routing/allocation/allocator/DesiredBalanceReconcilerTests.java +++ b/server/src/test/java/org/elasticsearch/cluster/routing/allocation/allocator/DesiredBalanceReconcilerTests.java @@ -57,7 +57,7 @@ import org.elasticsearch.common.collect.ImmutableOpenMap; import org.elasticsearch.common.settings.ClusterSettings; import org.elasticsearch.common.settings.Settings; -import org.elasticsearch.common.time.TimeProvider; +import org.elasticsearch.common.time.AdvancingTimeProvider; import org.elasticsearch.common.util.Maps; import org.elasticsearch.common.util.concurrent.DeterministicTaskQueue; import org.elasticsearch.core.TimeValue; @@ -84,7 +84,6 @@ import java.util.Set; import java.util.concurrent.atomic.AtomicBoolean; import java.util.concurrent.atomic.AtomicInteger; -import java.util.concurrent.atomic.AtomicLong; import java.util.concurrent.atomic.AtomicReference; import java.util.function.BiPredicate; import java.util.function.Consumer; @@ -92,7 +91,6 @@ import java.util.stream.Collectors; import java.util.stream.IntStream; -import static java.util.concurrent.TimeUnit.NANOSECONDS; import static org.elasticsearch.cluster.ClusterInfo.shardIdentifierFromRouting; import static org.elasticsearch.cluster.routing.RoutingNodesHelper.shardsWithState; import static org.elasticsearch.cluster.routing.ShardRoutingState.STARTED; @@ -1750,36 +1748,4 @@ private static Settings throttleSettings() { .put(CLUSTER_ROUTING_ALLOCATION_NODE_CONCURRENT_OUTGOING_RECOVERIES_SETTING.getKey(), 1000) .build(); } - - /** - * A time-provider that advances each time it's asked the time - */ - private static class AdvancingTimeProvider implements TimeProvider { - - private final AtomicLong currentTimeMillis = new AtomicLong(); - - public void advanceByMillis(long milliseconds) { - currentTimeMillis.addAndGet(milliseconds); - } - - @Override - public long relativeTimeInMillis() { - return currentTimeMillis.incrementAndGet(); - } - - @Override - public long relativeTimeInNanos() { - return NANOSECONDS.toNanos(relativeTimeInMillis()); - } - - @Override - public long rawRelativeTimeInMillis() { - return relativeTimeInMillis(); - } - - @Override - public long absoluteTimeInMillis() { - throw new UnsupportedOperationException("not supported"); - } - } } diff --git a/server/src/test/java/org/elasticsearch/cluster/routing/allocation/allocator/UndesiredAllocationsTrackerTests.java b/server/src/test/java/org/elasticsearch/cluster/routing/allocation/allocator/UndesiredAllocationsTrackerTests.java new file mode 100644 index 0000000000000..1c64de83a539b --- /dev/null +++ b/server/src/test/java/org/elasticsearch/cluster/routing/allocation/allocator/UndesiredAllocationsTrackerTests.java @@ -0,0 +1,87 @@ +/* + * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one + * or more contributor license agreements. Licensed under the "Elastic License + * 2.0", the "GNU Affero General Public License v3.0 only", and the "Server Side + * Public License v 1"; you may not use this file except in compliance with, at + * your election, the "Elastic License 2.0", the "GNU Affero General Public + * License v3.0 only", or the "Server Side Public License, v 1". + */ + +package org.elasticsearch.cluster.routing.allocation.allocator; + +import org.elasticsearch.cluster.metadata.ProjectId; +import org.elasticsearch.cluster.node.DiscoveryNodeUtils; +import org.elasticsearch.cluster.node.DiscoveryNodes; +import org.elasticsearch.cluster.routing.GlobalRoutingTable; +import org.elasticsearch.cluster.routing.IndexRoutingTable; +import org.elasticsearch.cluster.routing.RecoverySource; +import org.elasticsearch.cluster.routing.RoutingNodes; +import org.elasticsearch.cluster.routing.RoutingTable; +import org.elasticsearch.cluster.routing.ShardRouting; +import org.elasticsearch.cluster.routing.UnassignedInfo; +import org.elasticsearch.common.settings.ClusterSettings; +import org.elasticsearch.common.settings.Settings; +import org.elasticsearch.common.time.AdvancingTimeProvider; +import org.elasticsearch.index.Index; +import org.elasticsearch.index.shard.ShardId; +import org.elasticsearch.test.ESTestCase; + +import java.util.stream.Collectors; +import java.util.stream.IntStream; +import java.util.stream.StreamSupport; + +public class UndesiredAllocationsTrackerTests extends ESTestCase { + + public void testNewestRecordsArePurgedWhenLimitIsDecreased() { + final var initialMaximum = randomIntBetween(10, 20); + final var clusterSettings = ClusterSettings.createBuiltInClusterSettings( + Settings.builder().put(UndesiredAllocationsTracker.MAX_UNDESIRED_ALLOCATIONS_TO_TRACK.getKey(), initialMaximum).build() + ); + final var advancingTimeProvider = new AdvancingTimeProvider(); + final var undesiredAllocationsTracker = new UndesiredAllocationsTracker(clusterSettings, advancingTimeProvider); + final var indexName = randomIdentifier(); + final var index = new Index(indexName, indexName); + final var indexRoutingTableBuilder = IndexRoutingTable.builder(index); + final var discoveryNodesBuilder = DiscoveryNodes.builder(); + + // The shards with the lowest IDs will have the earliest timestamps + for (int i = 0; i < initialMaximum; i++) { + final var routing = createAssignedRouting(index, i, discoveryNodesBuilder); + indexRoutingTableBuilder.addShard(routing); + undesiredAllocationsTracker.trackUndesiredAllocation(routing); + } + final var routingNodes = RoutingNodes.immutable( + GlobalRoutingTable.builder().put(ProjectId.DEFAULT, RoutingTable.builder().add(indexRoutingTableBuilder).build()).build(), + discoveryNodesBuilder.build() + ); + + // Reduce the maximum + final var reducedMaximum = randomIntBetween(1, initialMaximum); + clusterSettings.applySettings( + Settings.builder().put(UndesiredAllocationsTracker.MAX_UNDESIRED_ALLOCATIONS_TO_TRACK.getKey(), reducedMaximum).build() + ); + + // We shouldn't purge the entries from the setting updater thread + assertEquals(initialMaximum, undesiredAllocationsTracker.getUndesiredAllocations().size()); + + // We should purge the most recent entries in #cleanup + undesiredAllocationsTracker.cleanup(routingNodes); + assertEquals(reducedMaximum, undesiredAllocationsTracker.getUndesiredAllocations().size()); + final var remainingShardIds = StreamSupport.stream(undesiredAllocationsTracker.getUndesiredAllocations().spliterator(), false) + .map(olc -> olc.key.shardId().id()) + .collect(Collectors.toSet()); + assertEquals(IntStream.range(0, reducedMaximum).boxed().collect(Collectors.toSet()), remainingShardIds); + } + + private ShardRouting createAssignedRouting(Index index, int shardId, DiscoveryNodes.Builder discoveryNodesBuilder) { + final var nodeId = randomAlphaOfLength(10); + discoveryNodesBuilder.add(DiscoveryNodeUtils.create(nodeId)); + return ShardRouting.newUnassigned( + new ShardId(index, shardId), + true, + RecoverySource.EmptyStoreRecoverySource.INSTANCE, + new UnassignedInfo(UnassignedInfo.Reason.INDEX_CREATED, randomIdentifier()), + randomFrom(ShardRouting.Role.DEFAULT, ShardRouting.Role.INDEX_ONLY) + ).initialize(nodeId, null, randomNonNegativeLong()); + } +} diff --git a/test/framework/src/main/java/org/elasticsearch/common/time/AdvancingTimeProvider.java b/test/framework/src/main/java/org/elasticsearch/common/time/AdvancingTimeProvider.java new file mode 100644 index 0000000000000..d96a125cf741c --- /dev/null +++ b/test/framework/src/main/java/org/elasticsearch/common/time/AdvancingTimeProvider.java @@ -0,0 +1,46 @@ +/* + * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one + * or more contributor license agreements. Licensed under the "Elastic License + * 2.0", the "GNU Affero General Public License v3.0 only", and the "Server Side + * Public License v 1"; you may not use this file except in compliance with, at + * your election, the "Elastic License 2.0", the "GNU Affero General Public + * License v3.0 only", or the "Server Side Public License, v 1". + */ + +package org.elasticsearch.common.time; + +import java.util.concurrent.atomic.AtomicLong; + +import static java.util.concurrent.TimeUnit.NANOSECONDS; + +/** + * A time-provider that advances each time it's asked the time + */ +public class AdvancingTimeProvider implements TimeProvider { + + private final AtomicLong currentTimeMillis = new AtomicLong(System.currentTimeMillis()); + + public void advanceByMillis(long milliseconds) { + currentTimeMillis.addAndGet(milliseconds); + } + + @Override + public long relativeTimeInMillis() { + return currentTimeMillis.incrementAndGet(); + } + + @Override + public long relativeTimeInNanos() { + return NANOSECONDS.toNanos(relativeTimeInMillis()); + } + + @Override + public long rawRelativeTimeInMillis() { + return relativeTimeInMillis(); + } + + @Override + public long absoluteTimeInMillis() { + throw new UnsupportedOperationException("not supported"); + } +} From 5fd07709aa61a4fe64b7561f84540b6833f11390 Mon Sep 17 00:00:00 2001 From: Nick Tindall Date: Thu, 6 Nov 2025 11:05:50 +1100 Subject: [PATCH 34/44] Fix test after log message changed --- .../allocation/allocator/DesiredBalanceReconcilerTests.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/server/src/test/java/org/elasticsearch/cluster/routing/allocation/allocator/DesiredBalanceReconcilerTests.java b/server/src/test/java/org/elasticsearch/cluster/routing/allocation/allocator/DesiredBalanceReconcilerTests.java index d9ba8e7de0569..410be0eb85045 100644 --- a/server/src/test/java/org/elasticsearch/cluster/routing/allocation/allocator/DesiredBalanceReconcilerTests.java +++ b/server/src/test/java/org/elasticsearch/cluster/routing/allocation/allocator/DesiredBalanceReconcilerTests.java @@ -1528,7 +1528,7 @@ public Decision canAllocate(ShardRouting shardRouting, RoutingNode node, Routing "Should log the NO decisions", UndesiredAllocationsTracker.class.getCanonicalName(), Level.WARN, - "[" + shardToPreventMovement + "][0] cannot be allocated on node [data-node-2]: [NO(Blocks allocation on node 2)]" + "[" + shardToPreventMovement + "][0] allocation decision for node [data-node-2]: [NO(Blocks allocation on node 2)]" ) ); From 7dcb690b2fb60a8784084755423a240d556d588b Mon Sep 17 00:00:00 2001 From: Nick Tindall Date: Thu, 6 Nov 2025 12:27:57 +1100 Subject: [PATCH 35/44] Default max to track to zero --- .../allocation/allocator/UndesiredAllocationsTracker.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/server/src/main/java/org/elasticsearch/cluster/routing/allocation/allocator/UndesiredAllocationsTracker.java b/server/src/main/java/org/elasticsearch/cluster/routing/allocation/allocator/UndesiredAllocationsTracker.java index c2d82d37e638f..8d78977889782 100644 --- a/server/src/main/java/org/elasticsearch/cluster/routing/allocation/allocator/UndesiredAllocationsTracker.java +++ b/server/src/main/java/org/elasticsearch/cluster/routing/allocation/allocator/UndesiredAllocationsTracker.java @@ -62,7 +62,7 @@ public class UndesiredAllocationsTracker { */ public static final Setting MAX_UNDESIRED_ALLOCATIONS_TO_TRACK = Setting.intSetting( "cluster.routing.allocation.desired_balance.undesired_duration_logging.max_to_track", - 10, + 0, 0, 200, Setting.Property.Dynamic, From ef8f45b3d7484aa4f10ebab5f160167b3cadaa3a Mon Sep 17 00:00:00 2001 From: Nick Tindall Date: Thu, 6 Nov 2025 14:46:19 +1100 Subject: [PATCH 36/44] Add more unit tests --- .../UndesiredAllocationsTrackerTests.java | 99 +++++++++++++++++-- 1 file changed, 93 insertions(+), 6 deletions(-) diff --git a/server/src/test/java/org/elasticsearch/cluster/routing/allocation/allocator/UndesiredAllocationsTrackerTests.java b/server/src/test/java/org/elasticsearch/cluster/routing/allocation/allocator/UndesiredAllocationsTrackerTests.java index 1c64de83a539b..3abae8cf5798a 100644 --- a/server/src/test/java/org/elasticsearch/cluster/routing/allocation/allocator/UndesiredAllocationsTrackerTests.java +++ b/server/src/test/java/org/elasticsearch/cluster/routing/allocation/allocator/UndesiredAllocationsTrackerTests.java @@ -9,6 +9,8 @@ package org.elasticsearch.cluster.routing.allocation.allocator; +import org.elasticsearch.action.support.replication.ClusterStateCreationUtils; +import org.elasticsearch.cluster.ClusterState; import org.elasticsearch.cluster.metadata.ProjectId; import org.elasticsearch.cluster.node.DiscoveryNodeUtils; import org.elasticsearch.cluster.node.DiscoveryNodes; @@ -22,6 +24,7 @@ import org.elasticsearch.common.settings.ClusterSettings; import org.elasticsearch.common.settings.Settings; import org.elasticsearch.common.time.AdvancingTimeProvider; +import org.elasticsearch.core.Nullable; import org.elasticsearch.index.Index; import org.elasticsearch.index.shard.ShardId; import org.elasticsearch.test.ESTestCase; @@ -32,6 +35,46 @@ public class UndesiredAllocationsTrackerTests extends ESTestCase { + public void testShardsArePrunedWhenRemovedFromRoutingTable() { + final int primaryShards = randomIntBetween(2, 5); + final int numberOfIndices = randomIntBetween(2, 5); + final int numberOfNodes = randomIntBetween(2, 5); + + final var clusterSettings = ClusterSettings.createBuiltInClusterSettings( + Settings.builder() + .put(UndesiredAllocationsTracker.MAX_UNDESIRED_ALLOCATIONS_TO_TRACK.getKey(), numberOfIndices * primaryShards) + .build() + ); + final var advancingTimeProvider = new AdvancingTimeProvider(); + final var undesiredAllocationsTracker = new UndesiredAllocationsTracker(clusterSettings, advancingTimeProvider); + + final var indexNames = IntStream.range(0, numberOfIndices).mapToObj(i -> "index-" + i).toArray(String[]::new); + final var state = ClusterStateCreationUtils.state(numberOfNodes, indexNames, primaryShards); + final var routingNodes = RoutingNodes.immutable(state.globalRoutingTable(), state.nodes()); + + // Mark all primary shards as undesired + routingNodes.forEach(routingNode -> { + routingNode.forEach(shardRouting -> { + if (shardRouting.primary()) { + undesiredAllocationsTracker.trackUndesiredAllocation(shardRouting); + } + }); + }); + assertEquals(numberOfIndices * primaryShards, undesiredAllocationsTracker.getUndesiredAllocations().size()); + + // Simulate an index being deleted + ClusterState stateWithIndexRemoved = removeRandomIndex(state); + + // Run cleanup with new RoutingNodes + undesiredAllocationsTracker.cleanup( + RoutingNodes.immutable(stateWithIndexRemoved.globalRoutingTable(), stateWithIndexRemoved.nodes()) + ); + assertEquals((numberOfIndices - 1) * primaryShards, undesiredAllocationsTracker.getUndesiredAllocations().size()); + undesiredAllocationsTracker.getUndesiredAllocations() + .iterator() + .forEachRemaining(olc -> assertNotNull(stateWithIndexRemoved.routingTable(ProjectId.DEFAULT).index(olc.key.index()))); + } + public void testNewestRecordsArePurgedWhenLimitIsDecreased() { final var initialMaximum = randomIntBetween(10, 20); final var clusterSettings = ClusterSettings.createBuiltInClusterSettings( @@ -42,17 +85,17 @@ public void testNewestRecordsArePurgedWhenLimitIsDecreased() { final var indexName = randomIdentifier(); final var index = new Index(indexName, indexName); final var indexRoutingTableBuilder = IndexRoutingTable.builder(index); - final var discoveryNodesBuilder = DiscoveryNodes.builder(); + final var discoveryNodes = randomDiscoveryNodes(randomIntBetween(initialMaximum / 2, initialMaximum)); // The shards with the lowest IDs will have the earliest timestamps for (int i = 0; i < initialMaximum; i++) { - final var routing = createAssignedRouting(index, i, discoveryNodesBuilder); + final var routing = createAssignedRouting(index, i, discoveryNodes); indexRoutingTableBuilder.addShard(routing); undesiredAllocationsTracker.trackUndesiredAllocation(routing); } final var routingNodes = RoutingNodes.immutable( GlobalRoutingTable.builder().put(ProjectId.DEFAULT, RoutingTable.builder().add(indexRoutingTableBuilder).build()).build(), - discoveryNodesBuilder.build() + discoveryNodes ); // Reduce the maximum @@ -73,9 +116,45 @@ public void testNewestRecordsArePurgedWhenLimitIsDecreased() { assertEquals(IntStream.range(0, reducedMaximum).boxed().collect(Collectors.toSet()), remainingShardIds); } - private ShardRouting createAssignedRouting(Index index, int shardId, DiscoveryNodes.Builder discoveryNodesBuilder) { - final var nodeId = randomAlphaOfLength(10); - discoveryNodesBuilder.add(DiscoveryNodeUtils.create(nodeId)); + public void testCannotTrackMoreShardsThanTheLimit() { + final int maxToTrack = randomIntBetween(1, 10); + final var clusterSettings = ClusterSettings.createBuiltInClusterSettings( + Settings.builder().put(UndesiredAllocationsTracker.MAX_UNDESIRED_ALLOCATIONS_TO_TRACK.getKey(), maxToTrack).build() + ); + final var advancingTimeProvider = new AdvancingTimeProvider(); + final var undesiredAllocationsTracker = new UndesiredAllocationsTracker(clusterSettings, advancingTimeProvider); + final var index = new Index(randomIdentifier(), randomIdentifier()); + + final int shardsToAdd = randomIntBetween(maxToTrack + 1, maxToTrack * 2); + for (int i = 0; i < shardsToAdd; i++) { + final var routing = createAssignedRouting(index, i); + undesiredAllocationsTracker.trackUndesiredAllocation(routing); + } + + // Only the first {maxToTrack} shards should be tracked + assertEquals(maxToTrack, undesiredAllocationsTracker.getUndesiredAllocations().size()); + final var trackedShardIds = StreamSupport.stream(undesiredAllocationsTracker.getUndesiredAllocations().spliterator(), false) + .map(olc -> olc.key.shardId().id()) + .collect(Collectors.toSet()); + assertEquals(IntStream.range(0, maxToTrack).boxed().collect(Collectors.toSet()), trackedShardIds); + } + + private ClusterState removeRandomIndex(ClusterState state) { + RoutingTable originalRoutingTable = state.routingTable(ProjectId.DEFAULT); + RoutingTable updatedRoutingTable = RoutingTable.builder(originalRoutingTable) + .remove(randomFrom(originalRoutingTable.indicesRouting().keySet())) + .build(); + return ClusterState.builder(state) + .routingTable(GlobalRoutingTable.builder().put(ProjectId.DEFAULT, updatedRoutingTable).build()) + .build(); + } + + private ShardRouting createAssignedRouting(Index index, int shardId) { + return createAssignedRouting(index, shardId, null); + } + + private ShardRouting createAssignedRouting(Index index, int shardId, @Nullable DiscoveryNodes discoveryNodes) { + final var nodeId = discoveryNodes == null ? randomAlphaOfLength(10) : randomFrom(discoveryNodes.getNodes().keySet()); return ShardRouting.newUnassigned( new ShardId(index, shardId), true, @@ -84,4 +163,12 @@ private ShardRouting createAssignedRouting(Index index, int shardId, DiscoveryNo randomFrom(ShardRouting.Role.DEFAULT, ShardRouting.Role.INDEX_ONLY) ).initialize(nodeId, null, randomNonNegativeLong()); } + + private DiscoveryNodes randomDiscoveryNodes(int numberOfNodes) { + final var nodes = DiscoveryNodes.builder(); + for (int i = 0; i < numberOfNodes; i++) { + nodes.add(DiscoveryNodeUtils.create("node-" + i)); + } + return nodes.build(); + } } From 19b74750dffcf392a43e3bb567b0836c820b3848 Mon Sep 17 00:00:00 2001 From: Nick Tindall Date: Thu, 6 Nov 2025 15:41:44 +1100 Subject: [PATCH 37/44] Make resilient to metadata changes --- .../UndesiredAllocationsTracker.java | 94 +++++++++++-------- .../UndesiredAllocationsTrackerTests.java | 53 +++++++++-- 2 files changed, 102 insertions(+), 45 deletions(-) diff --git a/server/src/main/java/org/elasticsearch/cluster/routing/allocation/allocator/UndesiredAllocationsTracker.java b/server/src/main/java/org/elasticsearch/cluster/routing/allocation/allocator/UndesiredAllocationsTracker.java index 8d78977889782..b9ff0f699ec49 100644 --- a/server/src/main/java/org/elasticsearch/cluster/routing/allocation/allocator/UndesiredAllocationsTracker.java +++ b/server/src/main/java/org/elasticsearch/cluster/routing/allocation/allocator/UndesiredAllocationsTracker.java @@ -9,9 +9,6 @@ package org.elasticsearch.cluster.routing.allocation.allocator; -import com.carrotsearch.hppc.ObjectLongHashMap; -import com.carrotsearch.hppc.ObjectLongMap; - import org.elasticsearch.cluster.routing.RoutingNodes; import org.elasticsearch.cluster.routing.ShardRouting; import org.elasticsearch.cluster.routing.allocation.RoutingAllocation; @@ -20,12 +17,13 @@ import org.elasticsearch.common.settings.Setting; import org.elasticsearch.common.time.TimeProvider; import org.elasticsearch.core.TimeValue; -import org.elasticsearch.core.Tuple; +import org.elasticsearch.index.shard.ShardId; import org.elasticsearch.logging.LogManager; import org.elasticsearch.logging.Logger; +import java.util.HashMap; +import java.util.Map; import java.util.stream.Collectors; -import java.util.stream.StreamSupport; /** * Keeps track of a limited number of shards that are currently in undesired allocations. If the @@ -70,7 +68,7 @@ public class UndesiredAllocationsTracker { ); private final TimeProvider timeProvider; - private final ObjectLongHashMap undesiredAllocations = new ObjectLongHashMap<>(); + private final Map undesiredAllocations = new HashMap<>(); private final FrequencyCappedAction undesiredAllocationDurationLogInterval; private volatile TimeValue undesiredAllocationDurationLoggingThreshold; private volatile int maxUndesiredAllocationsToTrack; @@ -94,8 +92,14 @@ public class UndesiredAllocationsTracker { */ public void trackUndesiredAllocation(ShardRouting shardRouting) { assert shardRouting.unassigned() == false : "Shouldn't record unassigned shards as undesired allocations"; - if (undesiredAllocations.size() < maxUndesiredAllocationsToTrack && undesiredAllocations.containsKey(shardRouting) == false) { - undesiredAllocations.put(shardRouting, timeProvider.relativeTimeInMillis()); + if (undesiredAllocations.size() < maxUndesiredAllocationsToTrack) { + final var allocationId = shardRouting.allocationId().getId(); + if (undesiredAllocations.containsKey(allocationId) == false) { + undesiredAllocations.put( + allocationId, + new UndesiredAllocation(shardRouting.shardId(), timeProvider.relativeTimeInMillis()) + ); + } } } @@ -103,21 +107,21 @@ public void trackUndesiredAllocation(ShardRouting shardRouting) { * Remove any tracking of the specified allocation (a no-op if the allocation isn't being tracked) */ public void removeTracking(ShardRouting shardRouting) { - undesiredAllocations.remove(shardRouting); + if (shardRouting.unassigned() == false) { + undesiredAllocations.remove(shardRouting.allocationId().getId()); + } else { + assert false : "Shouldn't remove tracking of unassigned shards"; + } } /** * Clear any {@link ShardRouting} that are no longer present in the routing nodes */ public void cleanup(RoutingNodes routingNodes) { - undesiredAllocations.removeAll(shardRouting -> { - final var allocationId = shardRouting.allocationId(); - if (allocationId != null) { - return routingNodes.getByAllocationId(shardRouting.shardId(), allocationId.getId()) == null; - } else { - assert false : "Unassigned shards shouldn't be marked as undesired"; - return true; - } + undesiredAllocations.entrySet().removeIf(e -> { + final var undesiredAllocation = e.getValue(); + final var allocationId = e.getKey(); + return routingNodes.getByAllocationId(undesiredAllocation.shardId(), allocationId) == null; }); shrinkIfOversized(); } @@ -140,9 +144,9 @@ public void maybeLogUndesiredShardsWarning( ) { final long currentTimeMillis = timeProvider.relativeTimeInMillis(); long earliestUndesiredTimestamp = Long.MAX_VALUE; - for (var allocation : undesiredAllocations) { - if (allocation.value < earliestUndesiredTimestamp) { - earliestUndesiredTimestamp = allocation.value; + for (var undesiredAllocation : undesiredAllocations.values()) { + if (undesiredAllocation.undesiredSince() < earliestUndesiredTimestamp) { + earliestUndesiredTimestamp = undesiredAllocation.undesiredSince(); } } if (earliestUndesiredTimestamp < currentTimeMillis @@ -160,15 +164,22 @@ private void logDecisionsForUndesiredShardsOverThreshold( ) { final long currentTimeMillis = timeProvider.relativeTimeInMillis(); final long loggingThresholdTimestamp = currentTimeMillis - undesiredAllocationDurationLoggingThreshold.millis(); - for (var allocation : undesiredAllocations) { - if (allocation.value < loggingThresholdTimestamp) { - logUndesiredShardDetails( - allocation.key, - TimeValue.timeValueMillis(currentTimeMillis - allocation.value), - routingNodes, - routingAllocation, - desiredBalance - ); + for (var allocation : undesiredAllocations.entrySet()) { + final var undesiredAllocation = allocation.getValue(); + final var allocationId = allocation.getKey(); + if (undesiredAllocation.undesiredSince() < loggingThresholdTimestamp) { + final var shardRouting = routingNodes.getByAllocationId(undesiredAllocation.shardId(), allocationId); + if (shardRouting != null) { + logUndesiredShardDetails( + shardRouting, + TimeValue.timeValueMillis(currentTimeMillis - undesiredAllocation.undesiredSince()), + routingNodes, + routingAllocation, + desiredBalance + ); + } else { + assert false : undesiredAllocation + " for allocationID " + allocationId + " was not cleaned up"; + } } } } @@ -200,19 +211,28 @@ private void logUndesiredShardDetails( */ private void shrinkIfOversized() { if (undesiredAllocations.size() > maxUndesiredAllocationsToTrack) { - final var newestExcessValues = StreamSupport.stream(undesiredAllocations.spliterator(), false) - // we need to take a copy from the cursor because the cursors are re-used, so don't work with #sorted - .map(cursor -> new Tuple<>(cursor.key, cursor.value)) - .sorted((a, b) -> Long.compare(b.v2(), a.v2())) + final var newestExcessAllocationIds = undesiredAllocations.entrySet() + .stream() + .sorted((a, b) -> Long.compare(b.getValue().undesiredSince(), a.getValue().undesiredSince())) .limit(undesiredAllocations.size() - maxUndesiredAllocationsToTrack) - .map(Tuple::v1) + .map(Map.Entry::getKey) .collect(Collectors.toSet()); - undesiredAllocations.removeAll(newestExcessValues::contains); + undesiredAllocations.keySet().removeAll(newestExcessAllocationIds); } } // visible for testing - ObjectLongMap getUndesiredAllocations() { - return undesiredAllocations.clone(); + Map getUndesiredAllocations() { + return Map.copyOf(undesiredAllocations); } + + /** + * Rather than storing the {@link ShardRouting}, we store a map of allocationId -> {@link UndesiredAllocation} + * this is because the allocation ID will persist as long as a shard stays on the same node, but the + * {@link ShardRouting} changes for a variety of reasons even when the shard doesn't move. + * + * @param shardId The shard ID + * @param undesiredSince The timestamp when the shard was first observed in an undesired allocation + */ + record UndesiredAllocation(ShardId shardId, long undesiredSince) {} } diff --git a/server/src/test/java/org/elasticsearch/cluster/routing/allocation/allocator/UndesiredAllocationsTrackerTests.java b/server/src/test/java/org/elasticsearch/cluster/routing/allocation/allocator/UndesiredAllocationsTrackerTests.java index 3abae8cf5798a..b40844436b741 100644 --- a/server/src/test/java/org/elasticsearch/cluster/routing/allocation/allocator/UndesiredAllocationsTrackerTests.java +++ b/server/src/test/java/org/elasticsearch/cluster/routing/allocation/allocator/UndesiredAllocationsTrackerTests.java @@ -31,7 +31,6 @@ import java.util.stream.Collectors; import java.util.stream.IntStream; -import java.util.stream.StreamSupport; public class UndesiredAllocationsTrackerTests extends ESTestCase { @@ -70,9 +69,14 @@ public void testShardsArePrunedWhenRemovedFromRoutingTable() { RoutingNodes.immutable(stateWithIndexRemoved.globalRoutingTable(), stateWithIndexRemoved.nodes()) ); assertEquals((numberOfIndices - 1) * primaryShards, undesiredAllocationsTracker.getUndesiredAllocations().size()); - undesiredAllocationsTracker.getUndesiredAllocations() - .iterator() - .forEachRemaining(olc -> assertNotNull(stateWithIndexRemoved.routingTable(ProjectId.DEFAULT).index(olc.key.index()))); + assertTrue( + undesiredAllocationsTracker.getUndesiredAllocations() + .values() + .stream() + .allMatch( + allocation -> stateWithIndexRemoved.routingTable(ProjectId.DEFAULT).index(allocation.shardId().getIndex()) != null + ) + ); } public void testNewestRecordsArePurgedWhenLimitIsDecreased() { @@ -110,8 +114,10 @@ public void testNewestRecordsArePurgedWhenLimitIsDecreased() { // We should purge the most recent entries in #cleanup undesiredAllocationsTracker.cleanup(routingNodes); assertEquals(reducedMaximum, undesiredAllocationsTracker.getUndesiredAllocations().size()); - final var remainingShardIds = StreamSupport.stream(undesiredAllocationsTracker.getUndesiredAllocations().spliterator(), false) - .map(olc -> olc.key.shardId().id()) + final var remainingShardIds = undesiredAllocationsTracker.getUndesiredAllocations() + .values() + .stream() + .map(allocation -> allocation.shardId().id()) .collect(Collectors.toSet()); assertEquals(IntStream.range(0, reducedMaximum).boxed().collect(Collectors.toSet()), remainingShardIds); } @@ -133,12 +139,43 @@ public void testCannotTrackMoreShardsThanTheLimit() { // Only the first {maxToTrack} shards should be tracked assertEquals(maxToTrack, undesiredAllocationsTracker.getUndesiredAllocations().size()); - final var trackedShardIds = StreamSupport.stream(undesiredAllocationsTracker.getUndesiredAllocations().spliterator(), false) - .map(olc -> olc.key.shardId().id()) + final var trackedShardIds = undesiredAllocationsTracker.getUndesiredAllocations() + .values() + .stream() + .map(allocation -> allocation.shardId().id()) .collect(Collectors.toSet()); assertEquals(IntStream.range(0, maxToTrack).boxed().collect(Collectors.toSet()), trackedShardIds); } + public void testUndesiredAllocationsAreIdentifiableDespiteMetadataChanges() { + final var clusterSettings = ClusterSettings.createBuiltInClusterSettings( + Settings.builder().put(UndesiredAllocationsTracker.MAX_UNDESIRED_ALLOCATIONS_TO_TRACK.getKey(), randomIntBetween(1, 10)).build() + ); + final var advancingTimeProvider = new AdvancingTimeProvider(); + final var undesiredAllocationsTracker = new UndesiredAllocationsTracker(clusterSettings, advancingTimeProvider); + final var index = new Index(randomIdentifier(), randomIdentifier()); + + ShardRouting shardRouting = createAssignedRouting(index, 0); + + undesiredAllocationsTracker.trackUndesiredAllocation(shardRouting); + assertEquals(1, undesiredAllocationsTracker.getUndesiredAllocations().size()); + + // move to started + shardRouting = shardRouting.moveToStarted(randomNonNegativeLong()); + undesiredAllocationsTracker.trackUndesiredAllocation(shardRouting); + assertEquals(1, undesiredAllocationsTracker.getUndesiredAllocations().size()); + + // start a relocation + shardRouting = shardRouting.relocate(randomIdentifier(), randomNonNegativeLong()); + undesiredAllocationsTracker.trackUndesiredAllocation(shardRouting); + assertEquals(1, undesiredAllocationsTracker.getUndesiredAllocations().size()); + + // cancel that relocation + shardRouting = shardRouting.cancelRelocation(); + undesiredAllocationsTracker.removeTracking(shardRouting); + assertEquals(0, undesiredAllocationsTracker.getUndesiredAllocations().size()); + } + private ClusterState removeRandomIndex(ClusterState state) { RoutingTable originalRoutingTable = state.routingTable(ProjectId.DEFAULT); RoutingTable updatedRoutingTable = RoutingTable.builder(originalRoutingTable) From ed7caa428745292962364c47c99d3a932283c04b Mon Sep 17 00:00:00 2001 From: Nick Tindall Date: Thu, 6 Nov 2025 15:44:25 +1100 Subject: [PATCH 38/44] Explicitly configure max tracking in test --- .../allocation/allocator/DesiredBalanceReconcilerTests.java | 1 + 1 file changed, 1 insertion(+) diff --git a/server/src/test/java/org/elasticsearch/cluster/routing/allocation/allocator/DesiredBalanceReconcilerTests.java b/server/src/test/java/org/elasticsearch/cluster/routing/allocation/allocator/DesiredBalanceReconcilerTests.java index 410be0eb85045..460e61125021d 100644 --- a/server/src/test/java/org/elasticsearch/cluster/routing/allocation/allocator/DesiredBalanceReconcilerTests.java +++ b/server/src/test/java/org/elasticsearch/cluster/routing/allocation/allocator/DesiredBalanceReconcilerTests.java @@ -1455,6 +1455,7 @@ public Decision canAllocate(ShardRouting shardRouting, RoutingNode node, Routing final var clusterSettings = createBuiltInClusterSettings( Settings.builder() .put(UndesiredAllocationsTracker.UNDESIRED_ALLOCATION_DURATION_LOG_THRESHOLD_SETTING.getKey(), undesiredAllocationThreshold) + .put(UndesiredAllocationsTracker.MAX_UNDESIRED_ALLOCATIONS_TO_TRACK.getKey(), 10) .build() ); final var timeProvider = new AdvancingTimeProvider(); From 53f95c07968eb38b4e01d7343e4920f96ca10b2c Mon Sep 17 00:00:00 2001 From: Nick Tindall Date: Thu, 6 Nov 2025 15:49:29 +1100 Subject: [PATCH 39/44] de-dupe default time value --- .../allocator/UndesiredAllocationsTracker.java | 17 ++++++++++++----- 1 file changed, 12 insertions(+), 5 deletions(-) diff --git a/server/src/main/java/org/elasticsearch/cluster/routing/allocation/allocator/UndesiredAllocationsTracker.java b/server/src/main/java/org/elasticsearch/cluster/routing/allocation/allocator/UndesiredAllocationsTracker.java index b9ff0f699ec49..13a767a3c5aed 100644 --- a/server/src/main/java/org/elasticsearch/cluster/routing/allocation/allocator/UndesiredAllocationsTracker.java +++ b/server/src/main/java/org/elasticsearch/cluster/routing/allocation/allocator/UndesiredAllocationsTracker.java @@ -25,6 +25,11 @@ import java.util.Map; import java.util.stream.Collectors; +import static org.elasticsearch.core.TimeValue.ONE_MINUTE; +import static org.elasticsearch.core.TimeValue.ZERO; +import static org.elasticsearch.core.TimeValue.timeValueMillis; +import static org.elasticsearch.core.TimeValue.timeValueMinutes; + /** * Keeps track of a limited number of shards that are currently in undesired allocations. If the * shards remain in undesired allocations for longer than a configurable threshold, it will log @@ -34,12 +39,14 @@ public class UndesiredAllocationsTracker { private static final Logger logger = LogManager.getLogger(UndesiredAllocationsTracker.class); + private static final TimeValue FIVE_MINUTES = timeValueMinutes(5); + /** * Warning logs will be periodically written if we see a shard that's been in an undesired allocation for this long */ public static final Setting UNDESIRED_ALLOCATION_DURATION_LOG_THRESHOLD_SETTING = Setting.timeSetting( "cluster.routing.allocation.desired_balance.undesired_duration_logging.threshold", - TimeValue.timeValueMinutes(5), + FIVE_MINUTES, Setting.Property.Dynamic, Setting.Property.NodeScope ); @@ -49,8 +56,8 @@ public class UndesiredAllocationsTracker { */ public static final Setting UNDESIRED_ALLOCATION_DURATION_LOG_INTERVAL_SETTING = Setting.timeSetting( "cluster.routing.allocation.desired_balance.undesired_duration_logging.interval", - TimeValue.timeValueMinutes(5), - TimeValue.timeValueMinutes(1), + FIVE_MINUTES, + ONE_MINUTE, Setting.Property.Dynamic, Setting.Property.NodeScope ); @@ -75,7 +82,7 @@ public class UndesiredAllocationsTracker { UndesiredAllocationsTracker(ClusterSettings clusterSettings, TimeProvider timeProvider) { this.timeProvider = timeProvider; - this.undesiredAllocationDurationLogInterval = new FrequencyCappedAction(timeProvider::relativeTimeInMillis, TimeValue.ZERO); + this.undesiredAllocationDurationLogInterval = new FrequencyCappedAction(timeProvider::relativeTimeInMillis, ZERO); clusterSettings.initializeAndWatch( UNDESIRED_ALLOCATION_DURATION_LOG_INTERVAL_SETTING, undesiredAllocationDurationLogInterval::setMinInterval @@ -172,7 +179,7 @@ private void logDecisionsForUndesiredShardsOverThreshold( if (shardRouting != null) { logUndesiredShardDetails( shardRouting, - TimeValue.timeValueMillis(currentTimeMillis - undesiredAllocation.undesiredSince()), + timeValueMillis(currentTimeMillis - undesiredAllocation.undesiredSince()), routingNodes, routingAllocation, desiredBalance From be8a38bcde085f9835fa94819d90cab70b5a7141 Mon Sep 17 00:00:00 2001 From: Nick Tindall Date: Thu, 6 Nov 2025 15:56:15 +1100 Subject: [PATCH 40/44] Remove conditional and leave assertion --- .../allocation/allocator/UndesiredAllocationsTracker.java | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/server/src/main/java/org/elasticsearch/cluster/routing/allocation/allocator/UndesiredAllocationsTracker.java b/server/src/main/java/org/elasticsearch/cluster/routing/allocation/allocator/UndesiredAllocationsTracker.java index 13a767a3c5aed..0570511c170ff 100644 --- a/server/src/main/java/org/elasticsearch/cluster/routing/allocation/allocator/UndesiredAllocationsTracker.java +++ b/server/src/main/java/org/elasticsearch/cluster/routing/allocation/allocator/UndesiredAllocationsTracker.java @@ -114,11 +114,8 @@ public void trackUndesiredAllocation(ShardRouting shardRouting) { * Remove any tracking of the specified allocation (a no-op if the allocation isn't being tracked) */ public void removeTracking(ShardRouting shardRouting) { - if (shardRouting.unassigned() == false) { - undesiredAllocations.remove(shardRouting.allocationId().getId()); - } else { - assert false : "Shouldn't remove tracking of unassigned shards"; - } + assert shardRouting.unassigned() == false : "Shouldn't remove tracking of unassigned shards"; + undesiredAllocations.remove(shardRouting.allocationId().getId()); } /** From 061e2ada06c80f7032fb413d09fceb9b858f35a3 Mon Sep 17 00:00:00 2001 From: Nick Tindall Date: Thu, 6 Nov 2025 15:56:59 +1100 Subject: [PATCH 41/44] Reduce maximum max-to-track --- .../allocation/allocator/UndesiredAllocationsTracker.java | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/server/src/main/java/org/elasticsearch/cluster/routing/allocation/allocator/UndesiredAllocationsTracker.java b/server/src/main/java/org/elasticsearch/cluster/routing/allocation/allocator/UndesiredAllocationsTracker.java index 0570511c170ff..09b66bef85233 100644 --- a/server/src/main/java/org/elasticsearch/cluster/routing/allocation/allocator/UndesiredAllocationsTracker.java +++ b/server/src/main/java/org/elasticsearch/cluster/routing/allocation/allocator/UndesiredAllocationsTracker.java @@ -63,13 +63,13 @@ public class UndesiredAllocationsTracker { ); /** - * The max number of undesired allocations to track. We expect this to be relatively small. + * The maximum number of undesired allocations to track. We expect this to be relatively small. */ public static final Setting MAX_UNDESIRED_ALLOCATIONS_TO_TRACK = Setting.intSetting( "cluster.routing.allocation.desired_balance.undesired_duration_logging.max_to_track", 0, 0, - 200, + 100, Setting.Property.Dynamic, Setting.Property.NodeScope ); From 566f5d5699848d510eb98a24b2c0bb4183422a8e Mon Sep 17 00:00:00 2001 From: Nick Tindall Date: Thu, 6 Nov 2025 16:51:59 +1100 Subject: [PATCH 42/44] Give duration threshold minimum of one minute --- .../allocation/allocator/UndesiredAllocationsTracker.java | 1 + 1 file changed, 1 insertion(+) diff --git a/server/src/main/java/org/elasticsearch/cluster/routing/allocation/allocator/UndesiredAllocationsTracker.java b/server/src/main/java/org/elasticsearch/cluster/routing/allocation/allocator/UndesiredAllocationsTracker.java index 09b66bef85233..43976723cff88 100644 --- a/server/src/main/java/org/elasticsearch/cluster/routing/allocation/allocator/UndesiredAllocationsTracker.java +++ b/server/src/main/java/org/elasticsearch/cluster/routing/allocation/allocator/UndesiredAllocationsTracker.java @@ -47,6 +47,7 @@ public class UndesiredAllocationsTracker { public static final Setting UNDESIRED_ALLOCATION_DURATION_LOG_THRESHOLD_SETTING = Setting.timeSetting( "cluster.routing.allocation.desired_balance.undesired_duration_logging.threshold", FIVE_MINUTES, + ONE_MINUTE, Setting.Property.Dynamic, Setting.Property.NodeScope ); From 655e3529b8c66ef9afa68d40ebfc82305a574d12 Mon Sep 17 00:00:00 2001 From: Nick Tindall Date: Fri, 7 Nov 2025 09:51:41 +1100 Subject: [PATCH 43/44] Use ordered map to quickly determine earliest entry --- .../UndesiredAllocationsTracker.java | 21 ++++++++----------- 1 file changed, 9 insertions(+), 12 deletions(-) diff --git a/server/src/main/java/org/elasticsearch/cluster/routing/allocation/allocator/UndesiredAllocationsTracker.java b/server/src/main/java/org/elasticsearch/cluster/routing/allocation/allocator/UndesiredAllocationsTracker.java index 43976723cff88..804f580a22ede 100644 --- a/server/src/main/java/org/elasticsearch/cluster/routing/allocation/allocator/UndesiredAllocationsTracker.java +++ b/server/src/main/java/org/elasticsearch/cluster/routing/allocation/allocator/UndesiredAllocationsTracker.java @@ -21,7 +21,7 @@ import org.elasticsearch.logging.LogManager; import org.elasticsearch.logging.Logger; -import java.util.HashMap; +import java.util.LinkedHashMap; import java.util.Map; import java.util.stream.Collectors; @@ -76,7 +76,7 @@ public class UndesiredAllocationsTracker { ); private final TimeProvider timeProvider; - private final Map undesiredAllocations = new HashMap<>(); + private final LinkedHashMap undesiredAllocations = new LinkedHashMap<>(); private final FrequencyCappedAction undesiredAllocationDurationLogInterval; private volatile TimeValue undesiredAllocationDurationLoggingThreshold; private volatile int maxUndesiredAllocationsToTrack; @@ -148,18 +148,15 @@ public void maybeLogUndesiredShardsWarning( DesiredBalance desiredBalance ) { final long currentTimeMillis = timeProvider.relativeTimeInMillis(); - long earliestUndesiredTimestamp = Long.MAX_VALUE; - for (var undesiredAllocation : undesiredAllocations.values()) { - if (undesiredAllocation.undesiredSince() < earliestUndesiredTimestamp) { - earliestUndesiredTimestamp = undesiredAllocation.undesiredSince(); + if (undesiredAllocations.isEmpty() == false) { + final long earliestUndesiredTimestamp = undesiredAllocations.firstEntry().getValue().undesiredSince(); + if (earliestUndesiredTimestamp < currentTimeMillis + && currentTimeMillis - earliestUndesiredTimestamp > undesiredAllocationDurationLoggingThreshold.millis()) { + undesiredAllocationDurationLogInterval.maybeExecute( + () -> logDecisionsForUndesiredShardsOverThreshold(routingNodes, routingAllocation, desiredBalance) + ); } } - if (earliestUndesiredTimestamp < currentTimeMillis - && currentTimeMillis - earliestUndesiredTimestamp > undesiredAllocationDurationLoggingThreshold.millis()) { - undesiredAllocationDurationLogInterval.maybeExecute( - () -> logDecisionsForUndesiredShardsOverThreshold(routingNodes, routingAllocation, desiredBalance) - ); - } } private void logDecisionsForUndesiredShardsOverThreshold( From feaca100dc4f4e3a72df5a48ab6078c3dd8bd221 Mon Sep 17 00:00:00 2001 From: Nick Tindall Date: Fri, 7 Nov 2025 10:13:51 +1100 Subject: [PATCH 44/44] Add test cases where there is no desired balance --- .../DesiredBalanceReconcilerTests.java | 45 ++++++++++++++++++- 1 file changed, 43 insertions(+), 2 deletions(-) diff --git a/server/src/test/java/org/elasticsearch/cluster/routing/allocation/allocator/DesiredBalanceReconcilerTests.java b/server/src/test/java/org/elasticsearch/cluster/routing/allocation/allocator/DesiredBalanceReconcilerTests.java index 460e61125021d..fb404ecdde8be 100644 --- a/server/src/test/java/org/elasticsearch/cluster/routing/allocation/allocator/DesiredBalanceReconcilerTests.java +++ b/server/src/test/java/org/elasticsearch/cluster/routing/allocation/allocator/DesiredBalanceReconcilerTests.java @@ -1465,14 +1465,35 @@ public Decision canAllocate(ShardRouting shardRouting, RoutingNode node, Routing final var shardInUndesiredAllocationMessage = "Shard [" + shardToPreventMovement + "][0] has been in an undesired allocation for *"; - // Desired assignment matches current routing table, should not log + // Desired balance not yet computed, should not log assertThatLogger( () -> currentStateHolder.set( reconcileAndBuildNewState( reconciler, initialClusterState, + DesiredBalance.BECOME_MASTER_INITIAL, + preventAllocationOnNode2Decider, + yesDecider + ) + ), + UndesiredAllocationsTracker.class, + new MockLog.UnseenEventExpectation( + "Should not log if desired balance is not yet computed", + UndesiredAllocationsTracker.class.getCanonicalName(), + Level.WARN, + shardInUndesiredAllocationMessage + ) + ); + + // Desired assignment matches current routing table, should not log + assertThatLogger( + () -> currentStateHolder.set( + reconcileAndBuildNewState( + reconciler, + currentStateHolder.get(), new DesiredBalance(1, allShardsDesiredOnDataNode1), - preventAllocationOnNode2Decider + preventAllocationOnNode2Decider, + yesDecider ) ), UndesiredAllocationsTracker.class, @@ -1507,6 +1528,26 @@ public Decision canAllocate(ShardRouting shardRouting, RoutingNode node, Routing // Advance past the logging threshold timeProvider.advanceByMillis(randomLongBetween(undesiredAllocationThreshold.millis(), undesiredAllocationThreshold.millis() * 2)); + // If the desired balance is missing for some reason, we shouldn't log, and we shouldn't reset the became-undesired time + assertThatLogger( + () -> currentStateHolder.set( + reconcileAndBuildNewState( + reconciler, + currentStateHolder.get(), + new DesiredBalance(1, Map.of()), + preventAllocationOnNode2Decider, + yesDecider + ) + ), + UndesiredAllocationsTracker.class, + new MockLog.UnseenEventExpectation( + "Should not log because there is no desired allocations", + UndesiredAllocationsTracker.class.getCanonicalName(), + Level.WARN, + shardInUndesiredAllocationMessage + ) + ); + // Now it should log assertThatLogger( () -> currentStateHolder.set(