From 053edfa7316016ac25311cd33b9476581e5a6d36 Mon Sep 17 00:00:00 2001 From: Nick Tindall Date: Fri, 19 Sep 2025 17:31:37 +1000 Subject: [PATCH 01/63] New new approach --- .../allocator/BalancedShardsAllocator.java | 147 +++++++++++++++--- 1 file changed, 124 insertions(+), 23 deletions(-) diff --git a/server/src/main/java/org/elasticsearch/cluster/routing/allocation/allocator/BalancedShardsAllocator.java b/server/src/main/java/org/elasticsearch/cluster/routing/allocation/allocator/BalancedShardsAllocator.java index 1255a1c784d92..4ec900b325c92 100644 --- a/server/src/main/java/org/elasticsearch/cluster/routing/allocation/allocator/BalancedShardsAllocator.java +++ b/server/src/main/java/org/elasticsearch/cluster/routing/allocation/allocator/BalancedShardsAllocator.java @@ -805,37 +805,102 @@ public boolean moveShards() { // Iterate over the started shards interleaving between nodes, and check if they can remain. In the presence of throttling // shard movements, the goal of this iteration order is to achieve a fairer movement of shards from the nodes that are // offloading the shards. + final Map bestNonPreferredShardsByNode = new HashMap<>(); + final Map comparatorCache = new HashMap<>(); for (Iterator it = allocation.routingNodes().nodeInterleavedShardIterator(); it.hasNext();) { ShardRouting shardRouting = it.next(); ProjectIndex index = projectIndex(shardRouting); - final MoveDecision moveDecision = decideMove(index, shardRouting); + final MoveDecision moveDecision = decideMove(index, shardRouting, bestNonPreferredShardsByNode, comparatorCache); if (moveDecision.isDecisionTaken() && moveDecision.forceMove()) { - final ModelNode sourceNode = nodes.get(shardRouting.currentNodeId()); - final ModelNode targetNode = nodes.get(moveDecision.getTargetNode().getId()); - sourceNode.removeShard(index, shardRouting); - Tuple relocatingShards = routingNodes.relocateShard( - shardRouting, - targetNode.getNodeId(), - allocation.clusterInfo().getShardSize(shardRouting, ShardRouting.UNAVAILABLE_EXPECTED_SHARD_SIZE), - "move", - allocation.changes() - ); - final ShardRouting shard = relocatingShards.v2(); - targetNode.addShard(projectIndex(shard), shard); - if (logger.isTraceEnabled()) { - logger.trace("Moved shard [{}] to node [{}]", shardRouting, targetNode.getRoutingNode()); - } - shardMoved = true; - if (completeEarlyOnShardAssignmentChange) { - return true; + // Defer moving of not-preferred until we've moved the NOs + if (moveDecision.getCanRemainDecision().type() == Type.NOT_PREFERRED) { + bestNonPreferredShardsByNode.put(shardRouting.currentNodeId(), shardRouting); + } else { + executeMove(shardRouting, index, moveDecision); + shardMoved = true; + if (completeEarlyOnShardAssignmentChange) { + return true; + } } } else if (moveDecision.isDecisionTaken() && moveDecision.canRemain() == false) { logger.trace("[{}][{}] can't move", shardRouting.index(), shardRouting.id()); } } + + // If we get here, attempt to move one of the best not-preferred shards that we identified earlier + for (var entry : bestNonPreferredShardsByNode.entrySet()) { + final var shardRouting = entry.getValue(); + final var index = projectIndex(shardRouting); + // Have to re-check move decision in case we made a move that invalidated our existing assessment + final MoveDecision moveDecision = decideMove(index, shardRouting); + if (moveDecision.isDecisionTaken() && moveDecision.forceMove()) { + executeMove(entry.getValue(), index, moveDecision); + // We only ever move a single non-preferred shard at a time + return true; + } + } + return shardMoved; } + private void executeMove(ShardRouting shardRouting, ProjectIndex index, MoveDecision moveDecision) { + final ModelNode sourceNode = nodes.get(shardRouting.currentNodeId()); + final ModelNode targetNode = nodes.get(moveDecision.getTargetNode().getId()); + sourceNode.removeShard(index, shardRouting); + Tuple relocatingShards = routingNodes.relocateShard( + shardRouting, + targetNode.getNodeId(), + allocation.clusterInfo().getShardSize(shardRouting, ShardRouting.UNAVAILABLE_EXPECTED_SHARD_SIZE), + "move", + allocation.changes() + ); + final ShardRouting shard = relocatingShards.v2(); + targetNode.addShard(projectIndex(shard), shard); + if (logger.isTraceEnabled()) { + logger.trace("Moved shard [{}] to node [{}]", shardRouting, targetNode.getRoutingNode()); + } + } + + private static class ShardMovementPriorityComparator implements Comparator { + + private final Map shardWriteLoads; + private final double lowThreshold; + private final double highThreshold; + + ShardMovementPriorityComparator(RoutingAllocation allocation, RoutingNode routingNode) { + shardWriteLoads = allocation.clusterInfo().getShardWriteLoads(); + double maxWriteLoadOnNode = 0; + for (ShardRouting shardRouting : routingNode) { + maxWriteLoadOnNode = Math.max(maxWriteLoadOnNode, shardWriteLoads.getOrDefault(shardRouting.shardId(), 0.0)); + } + lowThreshold = maxWriteLoadOnNode * 0.5; + highThreshold = maxWriteLoadOnNode * 0.8; + } + + @Override + public int compare(ShardRouting o1, ShardRouting o2) { + // If we have no shard write-load data, shortcut + if (highThreshold == 0) { + return 0; + } + // Otherwise, we prefer middle, high, then low write-load shards + return Integer.compare( + getWriteLoadLevel(shardWriteLoads.getOrDefault(o1.shardId(), 0.0)), + getWriteLoadLevel(shardWriteLoads.getOrDefault(o2.shardId(), 0.0)) + ); + } + + private int getWriteLoadLevel(double writeLoad) { + if (writeLoad < lowThreshold) { + return 2; + } + if (writeLoad < highThreshold) { + return 0; + } + return 1; + } + } + /** * Makes a decision on whether to move a started shard to another node. The following rules apply * to the {@link MoveDecision} return object: @@ -848,7 +913,17 @@ public boolean moveShards() { * 4. If the method is invoked in explain mode (e.g. from the cluster allocation explain APIs), then * {@link MoveDecision#getNodeDecisions} will have a non-null value. */ - public MoveDecision decideMove(final ProjectIndex index, final ShardRouting shardRouting) { + public MoveDecision decideMove(ProjectIndex index, ShardRouting shardRouting) { + // This is when we're not calling in a loop, so we don't need to keep track of "best non-preferred" shards + return decideMove(index, shardRouting, Map.of(), Map.of()); + } + + private MoveDecision decideMove( + ProjectIndex index, + ShardRouting shardRouting, + Map bestNonPreferredShardsByNode, + Map comparatorCache + ) { NodeSorter sorter = nodeSorters.sorterForShard(shardRouting); index.assertMatch(shardRouting); @@ -861,10 +936,22 @@ public MoveDecision decideMove(final ProjectIndex index, final ShardRouting shar assert sourceNode != null && sourceNode.containsShard(index, shardRouting); RoutingNode routingNode = sourceNode.getRoutingNode(); Decision canRemain = allocation.deciders().canRemain(shardRouting, routingNode, allocation); - if (canRemain.type() != Decision.Type.NO) { + if (canRemain.type() != Decision.Type.NO && canRemain.type() != Type.NOT_PREFERRED) { return MoveDecision.remain(canRemain); } + // Investigate whether it's potentially a better move to make than the one we've discovered already + if (canRemain.type() == Type.NOT_PREFERRED && bestNonPreferredShardsByNode.containsKey(shardRouting.currentNodeId())) { + int compare = comparatorCache.computeIfAbsent( + shardRouting.currentNodeId(), + nodeId -> new ShardMovementPriorityComparator(allocation, allocation.routingNodes().node(nodeId)) + ).compare(shardRouting, bestNonPreferredShardsByNode.get(shardRouting.currentNodeId())); + if (compare <= 0) { + // Ignore inferior non-preferred moves + return MoveDecision.NOT_TAKEN; + } + } + sorter.reset(index); /* * the sorter holds the minimum weight node first for the shards index. @@ -872,8 +959,13 @@ public MoveDecision decideMove(final ProjectIndex index, final ShardRouting shar * This is not guaranteed to be balanced after this operation we still try best effort to * allocate on the minimal eligible node. */ - MoveDecision moveDecision = decideMove(sorter, shardRouting, sourceNode, canRemain, this::decideCanAllocate); - if (moveDecision.canRemain() == false && moveDecision.forceMove() == false) { + BiFunction decider = canRemain.type() == Type.NOT_PREFERRED + ? this::decideCanAllocatePreferredOnly + : this::decideCanAllocate; + MoveDecision moveDecision = decideMove(sorter, shardRouting, sourceNode, canRemain, decider); + if (moveDecision.getCanRemainDecision().type() == Type.NO + && moveDecision.canRemain() == false + && moveDecision.forceMove() == false) { final boolean shardsOnReplacedNode = allocation.metadata().nodeShutdowns().contains(shardRouting.currentNodeId(), REPLACE); if (shardsOnReplacedNode) { return decideMove(sorter, shardRouting, sourceNode, canRemain, this::decideCanForceAllocateForVacate); @@ -924,6 +1016,15 @@ private MoveDecision decideMove( ); } + private Decision decideCanAllocatePreferredOnly(ShardRouting shardRouting, RoutingNode target) { + Decision decision = allocation.deciders().canAllocate(shardRouting, target, allocation); + // not-preferred means no here + if (decision.type() == Type.NOT_PREFERRED) { + return Decision.NO; + } + return decision; + } + private Decision decideCanAllocate(ShardRouting shardRouting, RoutingNode target) { // don't use canRebalance as we want hard filtering rules to apply. See #17698 return allocation.deciders().canAllocate(shardRouting, target, allocation); From e38191a2a55aa10072a821d8d7f660f536953ede Mon Sep 17 00:00:00 2001 From: Nick Tindall Date: Fri, 19 Sep 2025 17:34:31 +1000 Subject: [PATCH 02/63] Add logging when moveDecision is stale --- .../routing/allocation/allocator/BalancedShardsAllocator.java | 2 ++ 1 file changed, 2 insertions(+) diff --git a/server/src/main/java/org/elasticsearch/cluster/routing/allocation/allocator/BalancedShardsAllocator.java b/server/src/main/java/org/elasticsearch/cluster/routing/allocation/allocator/BalancedShardsAllocator.java index 4ec900b325c92..8d228c1374eae 100644 --- a/server/src/main/java/org/elasticsearch/cluster/routing/allocation/allocator/BalancedShardsAllocator.java +++ b/server/src/main/java/org/elasticsearch/cluster/routing/allocation/allocator/BalancedShardsAllocator.java @@ -837,6 +837,8 @@ public boolean moveShards() { executeMove(entry.getValue(), index, moveDecision); // We only ever move a single non-preferred shard at a time return true; + } else { + logger.trace("[{}][{}] can no longer move (not-preferred)", shardRouting.index(), shardRouting.id()); } } From 909fd6dea917ffd9aea2838806cc38ecd9705ed1 Mon Sep 17 00:00:00 2001 From: Nick Tindall Date: Fri, 19 Sep 2025 18:19:04 +1000 Subject: [PATCH 03/63] assert we're not abusing the comparator --- .../allocation/allocator/BalancedShardsAllocator.java | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/server/src/main/java/org/elasticsearch/cluster/routing/allocation/allocator/BalancedShardsAllocator.java b/server/src/main/java/org/elasticsearch/cluster/routing/allocation/allocator/BalancedShardsAllocator.java index 8d228c1374eae..822623a8a6bdd 100644 --- a/server/src/main/java/org/elasticsearch/cluster/routing/allocation/allocator/BalancedShardsAllocator.java +++ b/server/src/main/java/org/elasticsearch/cluster/routing/allocation/allocator/BalancedShardsAllocator.java @@ -52,6 +52,7 @@ import java.util.Iterator; import java.util.List; import java.util.Map; +import java.util.Objects; import java.util.Set; import java.util.function.BiFunction; @@ -868,6 +869,7 @@ private static class ShardMovementPriorityComparator implements Comparator shardWriteLoads; private final double lowThreshold; private final double highThreshold; + private final String nodeId; ShardMovementPriorityComparator(RoutingAllocation allocation, RoutingNode routingNode) { shardWriteLoads = allocation.clusterInfo().getShardWriteLoads(); @@ -877,10 +879,13 @@ private static class ShardMovementPriorityComparator implements Comparator Date: Mon, 22 Sep 2025 12:26:21 +1000 Subject: [PATCH 04/63] Change comparator ranking, test --- .../allocator/BalancedShardsAllocator.java | 73 +++++++---- .../BalancedShardsAllocatorTests.java | 121 +++++++++++++++++- 2 files changed, 167 insertions(+), 27 deletions(-) diff --git a/server/src/main/java/org/elasticsearch/cluster/routing/allocation/allocator/BalancedShardsAllocator.java b/server/src/main/java/org/elasticsearch/cluster/routing/allocation/allocator/BalancedShardsAllocator.java index 822623a8a6bdd..2e4e2da4c7399 100644 --- a/server/src/main/java/org/elasticsearch/cluster/routing/allocation/allocator/BalancedShardsAllocator.java +++ b/server/src/main/java/org/elasticsearch/cluster/routing/allocation/allocator/BalancedShardsAllocator.java @@ -864,47 +864,74 @@ private void executeMove(ShardRouting shardRouting, ProjectIndex index, MoveDeci } } - private static class ShardMovementPriorityComparator implements Comparator { + /** + * Sorts shards by desirability to move, ranking goes (in descending priority order) + *
    + *
  1. Shards with write-load in {threshold} -> maximum write-load (exclusive)
  2. + *
  3. Shards with write-load in {threshold} -> 0
  4. + *
  5. Shards with maximum write-load
  6. + *
  7. Shards with missing write-load
  8. + *
+ */ + // Visible for testing + static class ShardMovementPriorityComparator implements Comparator { private final Map shardWriteLoads; - private final double lowThreshold; - private final double highThreshold; + private final double maxWriteLoadOnNode; + private final double threshold; private final String nodeId; ShardMovementPriorityComparator(RoutingAllocation allocation, RoutingNode routingNode) { shardWriteLoads = allocation.clusterInfo().getShardWriteLoads(); - double maxWriteLoadOnNode = 0; + double maxWriteLoadOnNode = -1; for (ShardRouting shardRouting : routingNode) { maxWriteLoadOnNode = Math.max(maxWriteLoadOnNode, shardWriteLoads.getOrDefault(shardRouting.shardId(), 0.0)); } - lowThreshold = maxWriteLoadOnNode * 0.5; - highThreshold = maxWriteLoadOnNode * 0.8; + this.maxWriteLoadOnNode = maxWriteLoadOnNode; + threshold = maxWriteLoadOnNode * 0.5; nodeId = routingNode.nodeId(); } @Override - public int compare(ShardRouting o1, ShardRouting o2) { - assert Objects.equals(nodeId, o1.currentNodeId()) && Objects.equals(nodeId, o2.currentNodeId()) - : "ShardMovementPriorityComparator is node-specific"; + public int compare(ShardRouting lhs, ShardRouting rhs) { + assert Objects.equals(nodeId, lhs.currentNodeId()) && Objects.equals(nodeId, rhs.currentNodeId()) + : this.getClass().getSimpleName() + + " is node-specific. Comparator node ID=" + + nodeId + + ", lhs=" + + lhs.currentNodeId() + + ", rhs=" + + rhs.currentNodeId(); // If we have no shard write-load data, shortcut - if (highThreshold == 0) { + if (maxWriteLoadOnNode == -1) { return 0; } // Otherwise, we prefer middle, high, then low write-load shards - return Integer.compare( - getWriteLoadLevel(shardWriteLoads.getOrDefault(o1.shardId(), 0.0)), - getWriteLoadLevel(shardWriteLoads.getOrDefault(o2.shardId(), 0.0)) - ); - } - - private int getWriteLoadLevel(double writeLoad) { - if (writeLoad < lowThreshold) { - return 2; - } - if (writeLoad < highThreshold) { - return 0; + double lhsWriteLoad = shardWriteLoads.getOrDefault(lhs.shardId(), -1.0); + double rhsWriteLoad = shardWriteLoads.getOrDefault(rhs.shardId(), -1.0); + + if (lhsWriteLoad < maxWriteLoadOnNode && rhsWriteLoad < maxWriteLoadOnNode) { + if (lhsWriteLoad >= threshold && rhsWriteLoad >= threshold) { + // Both values between threshold and maximum, prefer lowest + return (int) Math.signum(rhsWriteLoad - lhsWriteLoad); + } else if (lhsWriteLoad >= threshold && rhsWriteLoad < threshold) { + // lhs between threshold and maximum, rhs below threshold, prefer lhs + return 1; + } else if (lhsWriteLoad < threshold && rhsWriteLoad >= threshold) { + // lhs below threshold, rhs between threshold and maximum, prefer rhs + return -1; + } else { + // Both values below the threshold, prefer highest + return (int) Math.signum(lhsWriteLoad - rhsWriteLoad); + } + } else { + // one of the shards is the max-write-load shard, prefer any present write load over it + if (shardWriteLoads.containsKey(rhs.shardId()) ^ shardWriteLoads.containsKey(lhs.shardId())) { + return shardWriteLoads.containsKey(rhs.shardId()) ? -1 : 1; + } else { + return lhsWriteLoad == rhsWriteLoad ? 0 : (int) Math.signum(rhsWriteLoad - lhsWriteLoad); + } } - return 1; } } diff --git a/server/src/test/java/org/elasticsearch/cluster/routing/allocation/allocator/BalancedShardsAllocatorTests.java b/server/src/test/java/org/elasticsearch/cluster/routing/allocation/allocator/BalancedShardsAllocatorTests.java index 2bacd38325f41..ca5c71edb0711 100644 --- a/server/src/test/java/org/elasticsearch/cluster/routing/allocation/allocator/BalancedShardsAllocatorTests.java +++ b/server/src/test/java/org/elasticsearch/cluster/routing/allocation/allocator/BalancedShardsAllocatorTests.java @@ -27,6 +27,7 @@ import org.elasticsearch.cluster.routing.AllocationId; import org.elasticsearch.cluster.routing.GlobalRoutingTable; import org.elasticsearch.cluster.routing.IndexRoutingTable; +import org.elasticsearch.cluster.routing.RoutingChangesObserver; import org.elasticsearch.cluster.routing.RoutingNode; import org.elasticsearch.cluster.routing.RoutingNodes; import org.elasticsearch.cluster.routing.RoutingNodesHelper; @@ -51,7 +52,9 @@ import org.elasticsearch.index.shard.ShardId; import org.elasticsearch.snapshots.SnapshotShardSizeInfo; import org.elasticsearch.test.gateway.TestGatewayAllocator; +import org.hamcrest.Matchers; +import java.util.ArrayList; import java.util.Arrays; import java.util.Collections; import java.util.HashMap; @@ -59,6 +62,7 @@ import java.util.List; import java.util.Map; import java.util.Set; +import java.util.function.DoubleSupplier; import java.util.function.Function; import java.util.stream.Collector; import java.util.stream.Collectors; @@ -78,6 +82,7 @@ import static org.hamcrest.Matchers.containsInAnyOrder; import static org.hamcrest.Matchers.equalTo; import static org.hamcrest.Matchers.everyItem; +import static org.hamcrest.Matchers.greaterThanOrEqualTo; import static org.hamcrest.Matchers.hasSize; import static org.hamcrest.Matchers.is; import static org.hamcrest.Matchers.lessThanOrEqualTo; @@ -86,6 +91,8 @@ public class BalancedShardsAllocatorTests extends ESAllocationTestCase { + private static final RoutingChangesObserver NOOP = new RoutingChangesObserver() { + }; private static final Settings WITH_DISK_BALANCING = Settings.builder().put(DISK_USAGE_BALANCE_FACTOR_SETTING.getKey(), "1e-9").build(); public void testDecideShardAllocation() { @@ -809,6 +816,112 @@ public void testReturnEarlyOnShardAssignmentChanges() { applyStartedShardsUntilNoChange(clusterState, allocationService); } + public void testShardMovementPriorityComparator() { + final double maxWriteLoad = randomDoubleBetween(0.0, 100.0, true); + final double lowThreshold = maxWriteLoad * 0.5; + final int numAtMax = between(1, 2); + final int numBetweenLowAndMax = between(1, 50); + final int numBelowLow = between(1, 50); + final int numMissing = between(1, 50); + final int totalShards = numAtMax + numBetweenLowAndMax + numBelowLow + numMissing; + + final var indices = new ArrayList(); + for (int i = 0; i < totalShards; i++) { + indices.add(anIndex("index-" + i).numberOfShards(1).numberOfReplicas(0)); + } + + final var nodeId = randomIdentifier(); + final var stateWithIndices = createStateWithIndices( + List.of(nodeId), + shardId -> nodeId, + indices.toArray(IndexMetadata.Builder[]::new) + ); + + final var allShards = stateWithIndices.routingTable(ProjectId.DEFAULT).allShards().collect(toSet()); + final var shardWriteLoads = new HashMap(); + addRandomWriteLoads(shardWriteLoads, allShards, numAtMax, () -> maxWriteLoad); + addRandomWriteLoads(shardWriteLoads, allShards, numBetweenLowAndMax, () -> randomDoubleBetween(lowThreshold, maxWriteLoad, true)); + addRandomWriteLoads(shardWriteLoads, allShards, numBelowLow, () -> randomDoubleBetween(0, lowThreshold, true)); + assertThat(allShards, hasSize(numMissing)); + + final var allocation = new RoutingAllocation( + new AllocationDeciders(List.of()), + stateWithIndices, + ClusterInfo.builder().shardWriteLoads(shardWriteLoads).build(), + SNAPSHOT_INFO_SERVICE_WITH_NO_SHARD_SIZES.snapshotShardSizes(), + System.nanoTime() + ); + + // Assign all shards to node + final var allocatedRoutingNodes = allocation.routingNodes().mutableCopy(); + for (ShardRouting shardRouting : allocatedRoutingNodes.unassigned()) { + logger.info("--> allocating shard [{}] to node [{}]", shardRouting, nodeId); + allocatedRoutingNodes.initializeShard(shardRouting, nodeId, null, randomNonNegativeLong(), NOOP); + } + + final var comparator = new BalancedShardsAllocator.Balancer.ShardMovementPriorityComparator( + allocation, + allocatedRoutingNodes.node(nodeId) + ); + + logger.info("--> testing shard movement priority comparator, maxValue={}, threshold={}", maxWriteLoad, lowThreshold); + var sortedShards = allocatedRoutingNodes.getAssignedShards().values().stream().flatMap(List::stream).sorted(comparator).toList(); + + for (ShardRouting shardRouting : sortedShards) { + logger.info("--> {}: {}", shardRouting.shardId(), shardWriteLoads.getOrDefault(shardRouting.shardId(), -1.0)); + } + + double lastWriteLoad = 0.0; + int currentIndex = 0; + logger.info("--> expecting {} missing", numMissing); + for (int i = 0; i < numMissing; i++) { + final var currentShardId = sortedShards.get(currentIndex++); + assertThat(shardWriteLoads, Matchers.not(Matchers.hasKey(currentShardId.shardId()))); + } + logger.info("--> expecting {} at max", numAtMax); + for (int i = 0; i < numAtMax; i++) { + final var currentShardId = sortedShards.get(currentIndex++).shardId(); + assertThat(shardWriteLoads, Matchers.hasKey(currentShardId)); + double currentWriteLoad = shardWriteLoads.get(currentShardId); + assertThat(currentWriteLoad, equalTo(maxWriteLoad)); + } + logger.info("--> expecting {} below low in ascending order", numBelowLow); + for (int i = 0; i < numBelowLow; i++) { + final var currentShardId = sortedShards.get(currentIndex++).shardId(); + assertThat(shardWriteLoads, Matchers.hasKey(currentShardId)); + double currentWriteLoad = shardWriteLoads.get(currentShardId); + if (i == 0) { + lastWriteLoad = currentWriteLoad; + } else { + assertThat(currentWriteLoad, greaterThanOrEqualTo(lastWriteLoad)); + } + } + logger.info("--> expecting {} between low and max in descending order", numBetweenLowAndMax); + for (int i = 0; i < numBetweenLowAndMax; i++) { + final var currentShardId = sortedShards.get(currentIndex++).shardId(); + assertThat(shardWriteLoads, Matchers.hasKey(currentShardId)); + double currentWriteLoad = shardWriteLoads.get(currentShardId); + if (i == 0) { + lastWriteLoad = currentWriteLoad; + } else { + assertThat(currentWriteLoad, lessThanOrEqualTo(lastWriteLoad)); + } + } + } + + private void addRandomWriteLoads( + Map shardWriteLoads, + Set shards, + int count, + DoubleSupplier writeLoadSupplier + ) { + for (int i = 0; i < count; i++) { + final var shardRouting = randomFrom(shards); + shardWriteLoads.put(shardRouting.shardId(), writeLoadSupplier.getAsDouble()); + shards.remove(shardRouting); + } + } + private Map getTargetShardPerNodeCount(IndexRoutingTable indexRoutingTable) { var counts = new HashMap(); for (int shardId = 0; shardId < indexRoutingTable.size(); shardId++) { @@ -850,7 +963,7 @@ private static ClusterState createStateWithIndices(IndexMetadata.Builder... inde private static ClusterState createStateWithIndices( List nodeNames, - Function unbalancedAllocator, + Function shardAllocator, IndexMetadata.Builder... indexMetadataBuilders ) { var metadataBuilder = Metadata.builder(); @@ -873,9 +986,9 @@ private static ClusterState createStateWithIndices( routingTableBuilder.add( IndexRoutingTable.builder(indexMetadata.getIndex()) .addShard( - shardRoutingBuilder(shardId, unbalancedAllocator.apply(shardId), true, ShardRoutingState.STARTED) - .withAllocationId(AllocationId.newInitializing(inSyncId)) - .build() + shardRoutingBuilder(shardId, shardAllocator.apply(shardId), true, ShardRoutingState.STARTED).withAllocationId( + AllocationId.newInitializing(inSyncId) + ).build() ) ); } From 10c042d15d54e16a422e5192ded26896b2719489 Mon Sep 17 00:00:00 2001 From: elasticsearchmachine Date: Mon, 22 Sep 2025 02:36:53 +0000 Subject: [PATCH 05/63] [CI] Update transport version definitions --- server/src/main/resources/transport/upper_bounds/8.18.csv | 2 +- server/src/main/resources/transport/upper_bounds/8.19.csv | 2 +- server/src/main/resources/transport/upper_bounds/9.0.csv | 2 +- server/src/main/resources/transport/upper_bounds/9.1.csv | 2 +- server/src/main/resources/transport/upper_bounds/9.2.csv | 2 +- 5 files changed, 5 insertions(+), 5 deletions(-) diff --git a/server/src/main/resources/transport/upper_bounds/8.18.csv b/server/src/main/resources/transport/upper_bounds/8.18.csv index 4eb5140004ea6..ffc592e1809ee 100644 --- a/server/src/main/resources/transport/upper_bounds/8.18.csv +++ b/server/src/main/resources/transport/upper_bounds/8.18.csv @@ -1 +1 @@ -initial_elasticsearch_8_18_6,8840008 +initial_elasticsearch_8_18_8,8840010 diff --git a/server/src/main/resources/transport/upper_bounds/8.19.csv b/server/src/main/resources/transport/upper_bounds/8.19.csv index 476468b203875..3cc6f439c5ea5 100644 --- a/server/src/main/resources/transport/upper_bounds/8.19.csv +++ b/server/src/main/resources/transport/upper_bounds/8.19.csv @@ -1 +1 @@ -initial_elasticsearch_8_19_3,8841067 +initial_elasticsearch_8_19_5,8841069 diff --git a/server/src/main/resources/transport/upper_bounds/9.0.csv b/server/src/main/resources/transport/upper_bounds/9.0.csv index f8f50cc6d7839..8ad2ed1a4cacf 100644 --- a/server/src/main/resources/transport/upper_bounds/9.0.csv +++ b/server/src/main/resources/transport/upper_bounds/9.0.csv @@ -1 +1 @@ -initial_elasticsearch_9_0_6,9000015 +initial_elasticsearch_9_0_8,9000017 diff --git a/server/src/main/resources/transport/upper_bounds/9.1.csv b/server/src/main/resources/transport/upper_bounds/9.1.csv index 5a65f2e578156..1cea5dc4d929b 100644 --- a/server/src/main/resources/transport/upper_bounds/9.1.csv +++ b/server/src/main/resources/transport/upper_bounds/9.1.csv @@ -1 +1 @@ -initial_elasticsearch_9_1_4,9112007 +initial_elasticsearch_9_1_5,9112008 diff --git a/server/src/main/resources/transport/upper_bounds/9.2.csv b/server/src/main/resources/transport/upper_bounds/9.2.csv index e24f914a1d1ca..bf1a90e5be4e9 100644 --- a/server/src/main/resources/transport/upper_bounds/9.2.csv +++ b/server/src/main/resources/transport/upper_bounds/9.2.csv @@ -1 +1 @@ -ml_inference_endpoint_cache,9157000 +index_request_include_tsid,9167000 From f03e4b8eeeed9cbc2271858395188d52d3a96220 Mon Sep 17 00:00:00 2001 From: Nick Tindall Date: Mon, 22 Sep 2025 12:40:02 +1000 Subject: [PATCH 06/63] Use constant for missing write load --- .../allocation/allocator/BalancedShardsAllocator.java | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/server/src/main/java/org/elasticsearch/cluster/routing/allocation/allocator/BalancedShardsAllocator.java b/server/src/main/java/org/elasticsearch/cluster/routing/allocation/allocator/BalancedShardsAllocator.java index 2e4e2da4c7399..c5f0061d2e944 100644 --- a/server/src/main/java/org/elasticsearch/cluster/routing/allocation/allocator/BalancedShardsAllocator.java +++ b/server/src/main/java/org/elasticsearch/cluster/routing/allocation/allocator/BalancedShardsAllocator.java @@ -876,6 +876,7 @@ private void executeMove(ShardRouting shardRouting, ProjectIndex index, MoveDeci // Visible for testing static class ShardMovementPriorityComparator implements Comparator { + private static final double MISSING_WRITE_LOAD = -1; private final Map shardWriteLoads; private final double maxWriteLoadOnNode; private final double threshold; @@ -883,9 +884,12 @@ static class ShardMovementPriorityComparator implements Comparator ShardMovementPriorityComparator(RoutingAllocation allocation, RoutingNode routingNode) { shardWriteLoads = allocation.clusterInfo().getShardWriteLoads(); - double maxWriteLoadOnNode = -1; + double maxWriteLoadOnNode = MISSING_WRITE_LOAD; for (ShardRouting shardRouting : routingNode) { - maxWriteLoadOnNode = Math.max(maxWriteLoadOnNode, shardWriteLoads.getOrDefault(shardRouting.shardId(), 0.0)); + maxWriteLoadOnNode = Math.max( + maxWriteLoadOnNode, + shardWriteLoads.getOrDefault(shardRouting.shardId(), MISSING_WRITE_LOAD) + ); } this.maxWriteLoadOnNode = maxWriteLoadOnNode; threshold = maxWriteLoadOnNode * 0.5; @@ -903,7 +907,7 @@ public int compare(ShardRouting lhs, ShardRouting rhs) { + ", rhs=" + rhs.currentNodeId(); // If we have no shard write-load data, shortcut - if (maxWriteLoadOnNode == -1) { + if (maxWriteLoadOnNode == MISSING_WRITE_LOAD) { return 0; } // Otherwise, we prefer middle, high, then low write-load shards From 9112b1dd72d5b9774cc57b8aae9a7c9c6ea9af84 Mon Sep 17 00:00:00 2001 From: Nick Tindall Date: Mon, 22 Sep 2025 12:45:54 +1000 Subject: [PATCH 07/63] Distinguish between move and move-not-preferred movements --- .../allocation/allocator/BalancedShardsAllocator.java | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/server/src/main/java/org/elasticsearch/cluster/routing/allocation/allocator/BalancedShardsAllocator.java b/server/src/main/java/org/elasticsearch/cluster/routing/allocation/allocator/BalancedShardsAllocator.java index c5f0061d2e944..9273f0bf7c85a 100644 --- a/server/src/main/java/org/elasticsearch/cluster/routing/allocation/allocator/BalancedShardsAllocator.java +++ b/server/src/main/java/org/elasticsearch/cluster/routing/allocation/allocator/BalancedShardsAllocator.java @@ -817,7 +817,7 @@ public boolean moveShards() { if (moveDecision.getCanRemainDecision().type() == Type.NOT_PREFERRED) { bestNonPreferredShardsByNode.put(shardRouting.currentNodeId(), shardRouting); } else { - executeMove(shardRouting, index, moveDecision); + executeMove(shardRouting, index, moveDecision, "move"); shardMoved = true; if (completeEarlyOnShardAssignmentChange) { return true; @@ -835,7 +835,7 @@ public boolean moveShards() { // Have to re-check move decision in case we made a move that invalidated our existing assessment final MoveDecision moveDecision = decideMove(index, shardRouting); if (moveDecision.isDecisionTaken() && moveDecision.forceMove()) { - executeMove(entry.getValue(), index, moveDecision); + executeMove(entry.getValue(), index, moveDecision, "move-non-preferred"); // We only ever move a single non-preferred shard at a time return true; } else { @@ -846,7 +846,7 @@ public boolean moveShards() { return shardMoved; } - private void executeMove(ShardRouting shardRouting, ProjectIndex index, MoveDecision moveDecision) { + private void executeMove(ShardRouting shardRouting, ProjectIndex index, MoveDecision moveDecision, String reason) { final ModelNode sourceNode = nodes.get(shardRouting.currentNodeId()); final ModelNode targetNode = nodes.get(moveDecision.getTargetNode().getId()); sourceNode.removeShard(index, shardRouting); @@ -854,7 +854,7 @@ private void executeMove(ShardRouting shardRouting, ProjectIndex index, MoveDeci shardRouting, targetNode.getNodeId(), allocation.clusterInfo().getShardSize(shardRouting, ShardRouting.UNAVAILABLE_EXPECTED_SHARD_SIZE), - "move", + reason, allocation.changes() ); final ShardRouting shard = relocatingShards.v2(); From 49d11e308b7b6a88693ab49f318054bd98071d2d Mon Sep 17 00:00:00 2001 From: Nick Tindall Date: Mon, 22 Sep 2025 13:00:31 +1000 Subject: [PATCH 08/63] Update javadoc --- .../routing/allocation/allocator/BalancedShardsAllocator.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/server/src/main/java/org/elasticsearch/cluster/routing/allocation/allocator/BalancedShardsAllocator.java b/server/src/main/java/org/elasticsearch/cluster/routing/allocation/allocator/BalancedShardsAllocator.java index 9273f0bf7c85a..ecb1d57447aaf 100644 --- a/server/src/main/java/org/elasticsearch/cluster/routing/allocation/allocator/BalancedShardsAllocator.java +++ b/server/src/main/java/org/elasticsearch/cluster/routing/allocation/allocator/BalancedShardsAllocator.java @@ -792,7 +792,7 @@ protected int comparePivot(int j) { } /** - * Move started shards that can not be allocated to a node anymore + * Move started shards that cannot be allocated to a node anymore, or are in a non-preferred allocation * * For each shard to be moved this function executes a move operation * to the minimal eligible node with respect to the From 7e23a15ceb900c6748e647c5ae30d12f28fe55d5 Mon Sep 17 00:00:00 2001 From: Nick Tindall Date: Mon, 22 Sep 2025 13:38:41 +1000 Subject: [PATCH 09/63] Tweak test params --- .../allocator/BalancedShardsAllocatorTests.java | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/server/src/test/java/org/elasticsearch/cluster/routing/allocation/allocator/BalancedShardsAllocatorTests.java b/server/src/test/java/org/elasticsearch/cluster/routing/allocation/allocator/BalancedShardsAllocatorTests.java index ca5c71edb0711..cd9dc48d2ecda 100644 --- a/server/src/test/java/org/elasticsearch/cluster/routing/allocation/allocator/BalancedShardsAllocatorTests.java +++ b/server/src/test/java/org/elasticsearch/cluster/routing/allocation/allocator/BalancedShardsAllocatorTests.java @@ -819,10 +819,10 @@ public void testReturnEarlyOnShardAssignmentChanges() { public void testShardMovementPriorityComparator() { final double maxWriteLoad = randomDoubleBetween(0.0, 100.0, true); final double lowThreshold = maxWriteLoad * 0.5; - final int numAtMax = between(1, 2); - final int numBetweenLowAndMax = between(1, 50); - final int numBelowLow = between(1, 50); - final int numMissing = between(1, 50); + final int numAtMax = between(1, 5); + final int numBetweenLowAndMax = between(0, 50); + final int numBelowLow = between(0, 50); + final int numMissing = between(0, 50); final int totalShards = numAtMax + numBetweenLowAndMax + numBelowLow + numMissing; final var indices = new ArrayList(); From fcde6c6dc974536e4a314401ec627fdf0856574c Mon Sep 17 00:00:00 2001 From: Nick Tindall Date: Mon, 22 Sep 2025 13:50:53 +1000 Subject: [PATCH 10/63] Simplify logic --- .../allocation/allocator/BalancedShardsAllocator.java | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/server/src/main/java/org/elasticsearch/cluster/routing/allocation/allocator/BalancedShardsAllocator.java b/server/src/main/java/org/elasticsearch/cluster/routing/allocation/allocator/BalancedShardsAllocator.java index ecb1d57447aaf..5cf4f598ec148 100644 --- a/server/src/main/java/org/elasticsearch/cluster/routing/allocation/allocator/BalancedShardsAllocator.java +++ b/server/src/main/java/org/elasticsearch/cluster/routing/allocation/allocator/BalancedShardsAllocator.java @@ -929,11 +929,13 @@ public int compare(ShardRouting lhs, ShardRouting rhs) { return (int) Math.signum(lhsWriteLoad - rhsWriteLoad); } } else { - // one of the shards is the max-write-load shard, prefer any present write load over it + // one of the shards is the max-write-load shard if (shardWriteLoads.containsKey(rhs.shardId()) ^ shardWriteLoads.containsKey(lhs.shardId())) { + // prefer any known write-load over it return shardWriteLoads.containsKey(rhs.shardId()) ? -1 : 1; } else { - return lhsWriteLoad == rhsWriteLoad ? 0 : (int) Math.signum(rhsWriteLoad - lhsWriteLoad); + // prefer the lowest (non-max) write load + return (int) Math.signum(rhsWriteLoad - lhsWriteLoad); } } } From ceff1e3513e69f3ea087f2ead1c57cea6c8e7c20 Mon Sep 17 00:00:00 2001 From: Nick Tindall Date: Mon, 22 Sep 2025 14:10:32 +1000 Subject: [PATCH 11/63] Simplify logic --- .../allocation/allocator/BalancedShardsAllocator.java | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/server/src/main/java/org/elasticsearch/cluster/routing/allocation/allocator/BalancedShardsAllocator.java b/server/src/main/java/org/elasticsearch/cluster/routing/allocation/allocator/BalancedShardsAllocator.java index 5cf4f598ec148..2580de062f227 100644 --- a/server/src/main/java/org/elasticsearch/cluster/routing/allocation/allocator/BalancedShardsAllocator.java +++ b/server/src/main/java/org/elasticsearch/cluster/routing/allocation/allocator/BalancedShardsAllocator.java @@ -930,9 +930,11 @@ public int compare(ShardRouting lhs, ShardRouting rhs) { } } else { // one of the shards is the max-write-load shard - if (shardWriteLoads.containsKey(rhs.shardId()) ^ shardWriteLoads.containsKey(lhs.shardId())) { + final var rhsIsMissing = rhsWriteLoad == MISSING_WRITE_LOAD; + final var lhsIsMissing = lhsWriteLoad == MISSING_WRITE_LOAD; + if (rhsIsMissing ^ lhsIsMissing) { // prefer any known write-load over it - return shardWriteLoads.containsKey(rhs.shardId()) ? -1 : 1; + return lhsIsMissing ? -1 : 1; } else { // prefer the lowest (non-max) write load return (int) Math.signum(rhsWriteLoad - lhsWriteLoad); From 6dcb6f0ac440c0f9ad55ba1cceb5332eca1b6550 Mon Sep 17 00:00:00 2001 From: Nick Tindall Date: Mon, 22 Sep 2025 14:32:13 +1000 Subject: [PATCH 12/63] Simplify logic --- .../allocator/BalancedShardsAllocator.java | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/server/src/main/java/org/elasticsearch/cluster/routing/allocation/allocator/BalancedShardsAllocator.java b/server/src/main/java/org/elasticsearch/cluster/routing/allocation/allocator/BalancedShardsAllocator.java index 2580de062f227..f47fbd079e8fc 100644 --- a/server/src/main/java/org/elasticsearch/cluster/routing/allocation/allocator/BalancedShardsAllocator.java +++ b/server/src/main/java/org/elasticsearch/cluster/routing/allocation/allocator/BalancedShardsAllocator.java @@ -915,18 +915,20 @@ public int compare(ShardRouting lhs, ShardRouting rhs) { double rhsWriteLoad = shardWriteLoads.getOrDefault(rhs.shardId(), -1.0); if (lhsWriteLoad < maxWriteLoadOnNode && rhsWriteLoad < maxWriteLoadOnNode) { - if (lhsWriteLoad >= threshold && rhsWriteLoad >= threshold) { + final var lhsOverThreshold = lhsWriteLoad >= threshold; + final var rhsOverThreshold = rhsWriteLoad >= threshold; + if (lhsOverThreshold && rhsOverThreshold) { // Both values between threshold and maximum, prefer lowest - return (int) Math.signum(rhsWriteLoad - lhsWriteLoad); - } else if (lhsWriteLoad >= threshold && rhsWriteLoad < threshold) { + return Double.compare(rhsWriteLoad, lhsWriteLoad); + } else if (lhsOverThreshold) { // lhs between threshold and maximum, rhs below threshold, prefer lhs return 1; - } else if (lhsWriteLoad < threshold && rhsWriteLoad >= threshold) { + } else if (rhsOverThreshold) { // lhs below threshold, rhs between threshold and maximum, prefer rhs return -1; } else { // Both values below the threshold, prefer highest - return (int) Math.signum(lhsWriteLoad - rhsWriteLoad); + return Double.compare(lhsWriteLoad, rhsWriteLoad); } } else { // one of the shards is the max-write-load shard @@ -937,7 +939,7 @@ public int compare(ShardRouting lhs, ShardRouting rhs) { return lhsIsMissing ? -1 : 1; } else { // prefer the lowest (non-max) write load - return (int) Math.signum(rhsWriteLoad - lhsWriteLoad); + return Double.compare(rhsWriteLoad, lhsWriteLoad); } } } From c03bcded49cefd46f5b4119c4f9b25f37fde68b2 Mon Sep 17 00:00:00 2001 From: Nick Tindall Date: Mon, 22 Sep 2025 14:35:46 +1000 Subject: [PATCH 13/63] Simplify logic --- .../allocator/BalancedShardsAllocator.java | 25 +++++++++---------- 1 file changed, 12 insertions(+), 13 deletions(-) diff --git a/server/src/main/java/org/elasticsearch/cluster/routing/allocation/allocator/BalancedShardsAllocator.java b/server/src/main/java/org/elasticsearch/cluster/routing/allocation/allocator/BalancedShardsAllocator.java index f47fbd079e8fc..dc1e4b0b34319 100644 --- a/server/src/main/java/org/elasticsearch/cluster/routing/allocation/allocator/BalancedShardsAllocator.java +++ b/server/src/main/java/org/elasticsearch/cluster/routing/allocation/allocator/BalancedShardsAllocator.java @@ -926,21 +926,20 @@ public int compare(ShardRouting lhs, ShardRouting rhs) { } else if (rhsOverThreshold) { // lhs below threshold, rhs between threshold and maximum, prefer rhs return -1; - } else { - // Both values below the threshold, prefer highest - return Double.compare(lhsWriteLoad, rhsWriteLoad); } + // Both values below the threshold, prefer highest + return Double.compare(lhsWriteLoad, rhsWriteLoad); + } + + // at least one of the shards is the max-write-load shard + final var rhsIsMissing = rhsWriteLoad == MISSING_WRITE_LOAD; + final var lhsIsMissing = lhsWriteLoad == MISSING_WRITE_LOAD; + if (rhsIsMissing ^ lhsIsMissing) { + // prefer any known write-load over it + return lhsIsMissing ? -1 : 1; } else { - // one of the shards is the max-write-load shard - final var rhsIsMissing = rhsWriteLoad == MISSING_WRITE_LOAD; - final var lhsIsMissing = lhsWriteLoad == MISSING_WRITE_LOAD; - if (rhsIsMissing ^ lhsIsMissing) { - // prefer any known write-load over it - return lhsIsMissing ? -1 : 1; - } else { - // prefer the lowest (non-max) write load - return Double.compare(rhsWriteLoad, lhsWriteLoad); - } + // prefer the lowest (non-max) write load + return Double.compare(rhsWriteLoad, lhsWriteLoad); } } } From fdca4048489609dd082dab4dce5b766c7987ebc3 Mon Sep 17 00:00:00 2001 From: Nick Tindall Date: Mon, 22 Sep 2025 14:40:10 +1000 Subject: [PATCH 14/63] Tidy --- .../allocation/allocator/BalancedShardsAllocator.java | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/server/src/main/java/org/elasticsearch/cluster/routing/allocation/allocator/BalancedShardsAllocator.java b/server/src/main/java/org/elasticsearch/cluster/routing/allocation/allocator/BalancedShardsAllocator.java index dc1e4b0b34319..ac53641df3b52 100644 --- a/server/src/main/java/org/elasticsearch/cluster/routing/allocation/allocator/BalancedShardsAllocator.java +++ b/server/src/main/java/org/elasticsearch/cluster/routing/allocation/allocator/BalancedShardsAllocator.java @@ -52,7 +52,6 @@ import java.util.Iterator; import java.util.List; import java.util.Map; -import java.util.Objects; import java.util.Set; import java.util.function.BiFunction; @@ -898,9 +897,9 @@ static class ShardMovementPriorityComparator implements Comparator @Override public int compare(ShardRouting lhs, ShardRouting rhs) { - assert Objects.equals(nodeId, lhs.currentNodeId()) && Objects.equals(nodeId, rhs.currentNodeId()) + assert nodeId.equals(lhs.currentNodeId()) && nodeId.equals(rhs.currentNodeId()) : this.getClass().getSimpleName() - + " is node-specific. Comparator node ID=" + + " is node-specific. comparator=" + nodeId + ", lhs=" + lhs.currentNodeId() @@ -910,9 +909,9 @@ public int compare(ShardRouting lhs, ShardRouting rhs) { if (maxWriteLoadOnNode == MISSING_WRITE_LOAD) { return 0; } - // Otherwise, we prefer middle, high, then low write-load shards - double lhsWriteLoad = shardWriteLoads.getOrDefault(lhs.shardId(), -1.0); - double rhsWriteLoad = shardWriteLoads.getOrDefault(rhs.shardId(), -1.0); + + double lhsWriteLoad = shardWriteLoads.getOrDefault(lhs.shardId(), MISSING_WRITE_LOAD); + double rhsWriteLoad = shardWriteLoads.getOrDefault(rhs.shardId(), MISSING_WRITE_LOAD); if (lhsWriteLoad < maxWriteLoadOnNode && rhsWriteLoad < maxWriteLoadOnNode) { final var lhsOverThreshold = lhsWriteLoad >= threshold; From 36cff36d3b55ff39ac08d1395bbe682c8ee70121 Mon Sep 17 00:00:00 2001 From: Nick Tindall Date: Mon, 22 Sep 2025 15:02:04 +1000 Subject: [PATCH 15/63] Simplify logic --- .../allocator/BalancedShardsAllocator.java | 19 +++++++++---------- 1 file changed, 9 insertions(+), 10 deletions(-) diff --git a/server/src/main/java/org/elasticsearch/cluster/routing/allocation/allocator/BalancedShardsAllocator.java b/server/src/main/java/org/elasticsearch/cluster/routing/allocation/allocator/BalancedShardsAllocator.java index ac53641df3b52..d42263b985425 100644 --- a/server/src/main/java/org/elasticsearch/cluster/routing/allocation/allocator/BalancedShardsAllocator.java +++ b/server/src/main/java/org/elasticsearch/cluster/routing/allocation/allocator/BalancedShardsAllocator.java @@ -913,6 +913,13 @@ public int compare(ShardRouting lhs, ShardRouting rhs) { double lhsWriteLoad = shardWriteLoads.getOrDefault(lhs.shardId(), MISSING_WRITE_LOAD); double rhsWriteLoad = shardWriteLoads.getOrDefault(rhs.shardId(), MISSING_WRITE_LOAD); + // prefer any known write-load over any unknown write-load + final var rhsIsMissing = rhsWriteLoad == MISSING_WRITE_LOAD; + final var lhsIsMissing = lhsWriteLoad == MISSING_WRITE_LOAD; + if (rhsIsMissing ^ lhsIsMissing) { + return lhsIsMissing ? -1 : 1; + } + if (lhsWriteLoad < maxWriteLoadOnNode && rhsWriteLoad < maxWriteLoadOnNode) { final var lhsOverThreshold = lhsWriteLoad >= threshold; final var rhsOverThreshold = rhsWriteLoad >= threshold; @@ -930,16 +937,8 @@ public int compare(ShardRouting lhs, ShardRouting rhs) { return Double.compare(lhsWriteLoad, rhsWriteLoad); } - // at least one of the shards is the max-write-load shard - final var rhsIsMissing = rhsWriteLoad == MISSING_WRITE_LOAD; - final var lhsIsMissing = lhsWriteLoad == MISSING_WRITE_LOAD; - if (rhsIsMissing ^ lhsIsMissing) { - // prefer any known write-load over it - return lhsIsMissing ? -1 : 1; - } else { - // prefer the lowest (non-max) write load - return Double.compare(rhsWriteLoad, lhsWriteLoad); - } + // prefer the non-max write load if there is one + return rhsWriteLoad >= maxWriteLoadOnNode ? 1 : -1; } } From 98b5b80b01afc2fbd067f04189363ca567b3ec60 Mon Sep 17 00:00:00 2001 From: Nick Tindall Date: Mon, 22 Sep 2025 15:13:27 +1000 Subject: [PATCH 16/63] Tidy --- .../routing/allocation/allocator/BalancedShardsAllocator.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/server/src/main/java/org/elasticsearch/cluster/routing/allocation/allocator/BalancedShardsAllocator.java b/server/src/main/java/org/elasticsearch/cluster/routing/allocation/allocator/BalancedShardsAllocator.java index d42263b985425..093ea206a90bf 100644 --- a/server/src/main/java/org/elasticsearch/cluster/routing/allocation/allocator/BalancedShardsAllocator.java +++ b/server/src/main/java/org/elasticsearch/cluster/routing/allocation/allocator/BalancedShardsAllocator.java @@ -938,7 +938,7 @@ public int compare(ShardRouting lhs, ShardRouting rhs) { } // prefer the non-max write load if there is one - return rhsWriteLoad >= maxWriteLoadOnNode ? 1 : -1; + return Double.compare(rhsWriteLoad, lhsWriteLoad); } } From df79e6f5bbe962aea24e57408d1de5f1b955d10a Mon Sep 17 00:00:00 2001 From: Nick Tindall Date: Mon, 22 Sep 2025 15:14:25 +1000 Subject: [PATCH 17/63] Pedantry --- .../routing/allocation/allocator/BalancedShardsAllocator.java | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/server/src/main/java/org/elasticsearch/cluster/routing/allocation/allocator/BalancedShardsAllocator.java b/server/src/main/java/org/elasticsearch/cluster/routing/allocation/allocator/BalancedShardsAllocator.java index 093ea206a90bf..5e9f805c465ad 100644 --- a/server/src/main/java/org/elasticsearch/cluster/routing/allocation/allocator/BalancedShardsAllocator.java +++ b/server/src/main/java/org/elasticsearch/cluster/routing/allocation/allocator/BalancedShardsAllocator.java @@ -910,8 +910,8 @@ public int compare(ShardRouting lhs, ShardRouting rhs) { return 0; } - double lhsWriteLoad = shardWriteLoads.getOrDefault(lhs.shardId(), MISSING_WRITE_LOAD); - double rhsWriteLoad = shardWriteLoads.getOrDefault(rhs.shardId(), MISSING_WRITE_LOAD); + final double lhsWriteLoad = shardWriteLoads.getOrDefault(lhs.shardId(), MISSING_WRITE_LOAD); + final double rhsWriteLoad = shardWriteLoads.getOrDefault(rhs.shardId(), MISSING_WRITE_LOAD); // prefer any known write-load over any unknown write-load final var rhsIsMissing = rhsWriteLoad == MISSING_WRITE_LOAD; From b2d67d7615f4d212b8b30e2cc2ac1b1137ae1f3a Mon Sep 17 00:00:00 2001 From: Nick Tindall Date: Mon, 22 Sep 2025 15:42:39 +1000 Subject: [PATCH 18/63] Add test for NOT_PREFERRED movement --- .../BalancedShardsAllocatorTests.java | 120 ++++++++++++++---- 1 file changed, 95 insertions(+), 25 deletions(-) diff --git a/server/src/test/java/org/elasticsearch/cluster/routing/allocation/allocator/BalancedShardsAllocatorTests.java b/server/src/test/java/org/elasticsearch/cluster/routing/allocation/allocator/BalancedShardsAllocatorTests.java index cd9dc48d2ecda..2a4293906373f 100644 --- a/server/src/test/java/org/elasticsearch/cluster/routing/allocation/allocator/BalancedShardsAllocatorTests.java +++ b/server/src/test/java/org/elasticsearch/cluster/routing/allocation/allocator/BalancedShardsAllocatorTests.java @@ -738,6 +738,20 @@ public void testReturnEarlyOnShardAssignmentChanges() { ) ); + // A started index with a non-preferred allocation + final IndexMetadata notPreferredAllocation = anIndex("large-not-preferred-allocation", indexSettings(IndexVersion.current(), 1, 0)) + .putInSyncAllocationIds(0, Set.of(UUIDs.randomBase64UUID())) + .build(); + projectMetadataBuilder.put(notPreferredAllocation, false); + routingTableBuilder.add( + IndexRoutingTable.builder(notPreferredAllocation.getIndex()) + .addShard( + shardRoutingBuilder(notPreferredAllocation.getIndex().getName(), 0, "large-1", true, ShardRoutingState.STARTED) + .withAllocationId(AllocationId.newInitializing(notPreferredAllocation.inSyncAllocationIds(0).iterator().next())) + .build() + ) + ); + // Indices with unbalanced weight of write loads final var numWriteLoadIndices = between(3, 5); for (int i = 0; i < numWriteLoadIndices; i++) { @@ -775,8 +789,12 @@ public void testReturnEarlyOnShardAssignmentChanges() { assertTrue("unexpected shard state: " + replicaShard, replicaShard.initializing()); // Undesired allocation is not moved because allocate call returns early - final var shard = routingTable.shardRoutingTable(undesiredAllocation.getIndex().getName(), 0).primaryShard(); - assertTrue("unexpected shard state: " + shard, shard.started()); + final var undesiredShard = routingTable.shardRoutingTable(notPreferredAllocation.getIndex().getName(), 0).primaryShard(); + assertTrue("unexpected shard state: " + undesiredShard, undesiredShard.started()); + + // Not-preferred shard is not moved because allocate call returns early + final var notPreferredShard = routingTable.shardRoutingTable(notPreferredAllocation.getIndex().getName(), 0).primaryShard(); + assertFalse("unexpected shard state: " + notPreferredShard, notPreferredShard.relocating()); // Also no rebalancing for indices with unbalanced write loads due to returning early for (int i = 0; i < numWriteLoadIndices; i++) { @@ -790,8 +808,12 @@ public void testReturnEarlyOnShardAssignmentChanges() { { // Undesired allocation is now relocating final RoutingTable routingTable = clusterState.routingTable(ProjectId.DEFAULT); - final var shard = routingTable.shardRoutingTable(undesiredAllocation.getIndex().getName(), 0).primaryShard(); - assertTrue("unexpected shard state: " + shard, shard.relocating()); + final var undesiredShard = routingTable.shardRoutingTable(undesiredAllocation.getIndex().getName(), 0).primaryShard(); + assertTrue("unexpected shard state: " + undesiredShard, undesiredShard.relocating()); + + // Not-preferred shard is not moved because allocate call returns early + final var notPreferredShard = routingTable.shardRoutingTable(notPreferredAllocation.getIndex().getName(), 0).primaryShard(); + assertFalse("unexpected shard state: " + notPreferredShard, notPreferredShard.relocating()); // Still no rebalancing for indices with unbalanced write loads due to returning early for (int i = 0; i < numWriteLoadIndices; i++) { @@ -802,6 +824,21 @@ public void testReturnEarlyOnShardAssignmentChanges() { // Third reroute clusterState = startInitializingShardsAndReroute(allocationService, clusterState); + { + // Not-preferred allocation is now relocating + final RoutingTable routingTable = clusterState.routingTable(ProjectId.DEFAULT); + final var notPreferredShard = routingTable.shardRoutingTable(notPreferredAllocation.getIndex().getName(), 0).primaryShard(); + assertTrue("unexpected shard state: " + notPreferredShard, notPreferredShard.relocating()); + + // Still no rebalancing for indices with unbalanced write loads due to returning early + for (int i = 0; i < numWriteLoadIndices; i++) { + final var writeLoadShard = routingTable.shardRoutingTable("large-write-load-" + i, 0).primaryShard(); + assertTrue("unexpected shard state: " + writeLoadShard, writeLoadShard.started()); + } + } + + // Fourth reroute + clusterState = startInitializingShardsAndReroute(allocationService, clusterState); { // Rebalance should happen for one and only one of the indices with unbalanced write loads due to returning early final RoutingTable routingTable = clusterState.routingTable(ProjectId.DEFAULT); @@ -1102,32 +1139,65 @@ public BalancedShardsAllocator.NodeSorter sorterForShard(ShardRouting shard) { } /** - * Allocation deciders that only allow shards to be allocated to nodes whose names share the same prefix + * Allocation deciders that trigger movements based on specific index names + * + * @see MoveNotPreferredOnceDecider + * @see PrefixAllocationDecider + */ + private static AllocationDeciders prefixAllocationDeciders() { + return new AllocationDeciders( + List.of( + new PrefixAllocationDecider(), + new MoveNotPreferredOnceDecider(), + new SameShardAllocationDecider(ClusterSettings.createBuiltInClusterSettings()) + ) + ); + } + + /** + * Allocation decider that only allow shards to be allocated to nodes whose names share the same prefix * as the index they're from */ - private AllocationDeciders prefixAllocationDeciders() { - return new AllocationDeciders(List.of(new AllocationDecider() { - @Override - public Decision canAllocate(ShardRouting shardRouting, RoutingNode node, RoutingAllocation allocation) { - return nodePrefixMatchesIndexPrefix(shardRouting, node); - } + private static class PrefixAllocationDecider extends AllocationDecider { + @Override + public Decision canAllocate(ShardRouting shardRouting, RoutingNode node, RoutingAllocation allocation) { + return nodePrefixMatchesIndexPrefix(shardRouting, node); + } - @Override - public Decision canRemain( - IndexMetadata indexMetadata, - ShardRouting shardRouting, - RoutingNode node, - RoutingAllocation allocation - ) { - return nodePrefixMatchesIndexPrefix(shardRouting, node); - } + @Override + public Decision canRemain(IndexMetadata indexMetadata, ShardRouting shardRouting, RoutingNode node, RoutingAllocation allocation) { + return nodePrefixMatchesIndexPrefix(shardRouting, node); + } - private Decision nodePrefixMatchesIndexPrefix(ShardRouting shardRouting, RoutingNode node) { - var indexPrefix = prefix(shardRouting.index().getName()); - var nodePrefix = prefix(node.node().getId()); - return nodePrefix.equals(indexPrefix) ? Decision.YES : Decision.NO; + private Decision nodePrefixMatchesIndexPrefix(ShardRouting shardRouting, RoutingNode node) { + var indexPrefix = prefix(shardRouting.index().getName()); + var nodePrefix = prefix(node.node().getId()); + return nodePrefix.equals(indexPrefix) ? Decision.YES : Decision.NO; + } + } + + /** + * Returns {@link Decision#NOT_PREFERRED} for any shard with 'not-preferred' in the index + * name, until it's moved to another node. + */ + private static class MoveNotPreferredOnceDecider extends AllocationDecider { + + private Map originalNodes = new HashMap<>(); + + @Override + public Decision canRemain(IndexMetadata indexMetadata, ShardRouting shardRouting, RoutingNode node, RoutingAllocation allocation) { + if (shardRouting.currentNodeId() == null) { + return Decision.YES; + } + if (shardRouting.index().getName().contains("not-preferred") == false) { + return Decision.YES; } - }, new SameShardAllocationDecider(ClusterSettings.createBuiltInClusterSettings()))); + if (originalNodes.containsKey(shardRouting.shardId()) == false) { + // Remember where we first saw it + originalNodes.put(shardRouting.shardId(), shardRouting.currentNodeId()); + } + return shardRouting.currentNodeId().equals(originalNodes.get(shardRouting.shardId())) ? Decision.NOT_PREFERRED : Decision.YES; + } } private static String prefix(String value) { From 3c5f5920b1e09e169cca53b9271249d29575473e Mon Sep 17 00:00:00 2001 From: Nick Tindall Date: Mon, 22 Sep 2025 15:48:10 +1000 Subject: [PATCH 19/63] Pedantry --- .../allocation/allocator/BalancedShardsAllocatorTests.java | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/server/src/test/java/org/elasticsearch/cluster/routing/allocation/allocator/BalancedShardsAllocatorTests.java b/server/src/test/java/org/elasticsearch/cluster/routing/allocation/allocator/BalancedShardsAllocatorTests.java index 2a4293906373f..73eee0d1c5d50 100644 --- a/server/src/test/java/org/elasticsearch/cluster/routing/allocation/allocator/BalancedShardsAllocatorTests.java +++ b/server/src/test/java/org/elasticsearch/cluster/routing/allocation/allocator/BalancedShardsAllocatorTests.java @@ -919,14 +919,14 @@ public void testShardMovementPriorityComparator() { for (int i = 0; i < numAtMax; i++) { final var currentShardId = sortedShards.get(currentIndex++).shardId(); assertThat(shardWriteLoads, Matchers.hasKey(currentShardId)); - double currentWriteLoad = shardWriteLoads.get(currentShardId); + final double currentWriteLoad = shardWriteLoads.get(currentShardId); assertThat(currentWriteLoad, equalTo(maxWriteLoad)); } logger.info("--> expecting {} below low in ascending order", numBelowLow); for (int i = 0; i < numBelowLow; i++) { final var currentShardId = sortedShards.get(currentIndex++).shardId(); assertThat(shardWriteLoads, Matchers.hasKey(currentShardId)); - double currentWriteLoad = shardWriteLoads.get(currentShardId); + final double currentWriteLoad = shardWriteLoads.get(currentShardId); if (i == 0) { lastWriteLoad = currentWriteLoad; } else { @@ -937,7 +937,7 @@ public void testShardMovementPriorityComparator() { for (int i = 0; i < numBetweenLowAndMax; i++) { final var currentShardId = sortedShards.get(currentIndex++).shardId(); assertThat(shardWriteLoads, Matchers.hasKey(currentShardId)); - double currentWriteLoad = shardWriteLoads.get(currentShardId); + final double currentWriteLoad = shardWriteLoads.get(currentShardId); if (i == 0) { lastWriteLoad = currentWriteLoad; } else { From f204577adb724936b198386f1bcdbf70ded25771 Mon Sep 17 00:00:00 2001 From: Nick Tindall Date: Mon, 22 Sep 2025 16:17:18 +1000 Subject: [PATCH 20/63] Expand test to cover prioritisation --- .../BalancedShardsAllocatorTests.java | 78 ++++++++++++++++++- 1 file changed, 75 insertions(+), 3 deletions(-) diff --git a/server/src/test/java/org/elasticsearch/cluster/routing/allocation/allocator/BalancedShardsAllocatorTests.java b/server/src/test/java/org/elasticsearch/cluster/routing/allocation/allocator/BalancedShardsAllocatorTests.java index 73eee0d1c5d50..523d18f6f46d0 100644 --- a/server/src/test/java/org/elasticsearch/cluster/routing/allocation/allocator/BalancedShardsAllocatorTests.java +++ b/server/src/test/java/org/elasticsearch/cluster/routing/allocation/allocator/BalancedShardsAllocatorTests.java @@ -688,11 +688,12 @@ public void testPartitionedClusterWithSeparateWeights() { } public void testReturnEarlyOnShardAssignmentChanges() { + var shardWriteLoads = new HashMap(); var allocationService = new MockAllocationService( prefixAllocationDeciders(), new TestGatewayAllocator(), new BalancedShardsAllocator(BalancerSettings.DEFAULT, TEST_WRITE_LOAD_FORECASTER), - EmptyClusterInfoService.INSTANCE, + () -> ClusterInfo.builder().shardWriteLoads(shardWriteLoads).build(), SNAPSHOT_INFO_SERVICE_WITH_NO_SHARD_SIZES ); @@ -738,7 +739,8 @@ public void testReturnEarlyOnShardAssignmentChanges() { ) ); - // A started index with a non-preferred allocation + // A started index with a non-preferred allocation (low write load) + final double notPreferredLowerWriteLoad = randomDoubleBetween(0.0, 5.0, true); final IndexMetadata notPreferredAllocation = anIndex("large-not-preferred-allocation", indexSettings(IndexVersion.current(), 1, 0)) .putInSyncAllocationIds(0, Set.of(UUIDs.randomBase64UUID())) .build(); @@ -751,6 +753,30 @@ public void testReturnEarlyOnShardAssignmentChanges() { .build() ) ); + shardWriteLoads.put(new ShardId(notPreferredAllocation.getIndex(), 0), notPreferredLowerWriteLoad); + + // A started index with a non-preferred allocation (max write load) + final double notPreferredMaxWriteLoad = randomDoubleBetween(notPreferredLowerWriteLoad, 15.0, true); + final IndexMetadata notPreferredAllocationMaxWriteLoad = anIndex( + "large-not-preferred-allocation-max-write-load", + indexSettings(IndexVersion.current(), 1, 0) + ).putInSyncAllocationIds(0, Set.of(UUIDs.randomBase64UUID())).build(); + projectMetadataBuilder.put(notPreferredAllocationMaxWriteLoad, false); + routingTableBuilder.add( + IndexRoutingTable.builder(notPreferredAllocationMaxWriteLoad.getIndex()) + .addShard( + shardRoutingBuilder( + notPreferredAllocationMaxWriteLoad.getIndex().getName(), + 0, + "large-1", + true, + ShardRoutingState.STARTED + ).withAllocationId( + AllocationId.newInitializing(notPreferredAllocationMaxWriteLoad.inSyncAllocationIds(0).iterator().next()) + ).build() + ) + ); + shardWriteLoads.put(new ShardId(notPreferredAllocationMaxWriteLoad.getIndex(), 0), notPreferredMaxWriteLoad); // Indices with unbalanced weight of write loads final var numWriteLoadIndices = between(3, 5); @@ -789,13 +815,23 @@ public void testReturnEarlyOnShardAssignmentChanges() { assertTrue("unexpected shard state: " + replicaShard, replicaShard.initializing()); // Undesired allocation is not moved because allocate call returns early - final var undesiredShard = routingTable.shardRoutingTable(notPreferredAllocation.getIndex().getName(), 0).primaryShard(); + final var undesiredShard = routingTable.shardRoutingTable(undesiredAllocation.getIndex().getName(), 0).primaryShard(); assertTrue("unexpected shard state: " + undesiredShard, undesiredShard.started()); // Not-preferred shard is not moved because allocate call returns early final var notPreferredShard = routingTable.shardRoutingTable(notPreferredAllocation.getIndex().getName(), 0).primaryShard(); assertFalse("unexpected shard state: " + notPreferredShard, notPreferredShard.relocating()); + // Not-preferred (max-write-load) shard is not moved because allocate call returns early + final var notPreferredAllocationMaxWriteLoadShard = routingTable.shardRoutingTable( + notPreferredAllocationMaxWriteLoad.getIndex().getName(), + 0 + ).primaryShard(); + assertFalse( + "unexpected shard state: " + notPreferredAllocationMaxWriteLoadShard, + notPreferredAllocationMaxWriteLoadShard.relocating() + ); + // Also no rebalancing for indices with unbalanced write loads due to returning early for (int i = 0; i < numWriteLoadIndices; i++) { final var writeLoadShard = routingTable.shardRoutingTable("large-write-load-" + i, 0).primaryShard(); @@ -815,6 +851,16 @@ public void testReturnEarlyOnShardAssignmentChanges() { final var notPreferredShard = routingTable.shardRoutingTable(notPreferredAllocation.getIndex().getName(), 0).primaryShard(); assertFalse("unexpected shard state: " + notPreferredShard, notPreferredShard.relocating()); + // Not-preferred (max-write-load) shard is not moved because allocate call returns early + final var notPreferredAllocationMaxWriteLoadShard = routingTable.shardRoutingTable( + notPreferredAllocationMaxWriteLoad.getIndex().getName(), + 0 + ).primaryShard(); + assertFalse( + "unexpected shard state: " + notPreferredAllocationMaxWriteLoadShard, + notPreferredAllocationMaxWriteLoadShard.relocating() + ); + // Still no rebalancing for indices with unbalanced write loads due to returning early for (int i = 0; i < numWriteLoadIndices; i++) { final var writeLoadShard = routingTable.shardRoutingTable("large-write-load-" + i, 0).primaryShard(); @@ -830,6 +876,16 @@ public void testReturnEarlyOnShardAssignmentChanges() { final var notPreferredShard = routingTable.shardRoutingTable(notPreferredAllocation.getIndex().getName(), 0).primaryShard(); assertTrue("unexpected shard state: " + notPreferredShard, notPreferredShard.relocating()); + // Not-preferred (max-write-load) shard is not moved because allocate call returns early + final var notPreferredAllocationMaxWriteLoadShard = routingTable.shardRoutingTable( + notPreferredAllocationMaxWriteLoad.getIndex().getName(), + 0 + ).primaryShard(); + assertFalse( + "unexpected shard state: " + notPreferredAllocationMaxWriteLoadShard, + notPreferredAllocationMaxWriteLoadShard.relocating() + ); + // Still no rebalancing for indices with unbalanced write loads due to returning early for (int i = 0; i < numWriteLoadIndices; i++) { final var writeLoadShard = routingTable.shardRoutingTable("large-write-load-" + i, 0).primaryShard(); @@ -839,6 +895,22 @@ public void testReturnEarlyOnShardAssignmentChanges() { // Fourth reroute clusterState = startInitializingShardsAndReroute(allocationService, clusterState); + { + // Not-preferred (max-write-load) allocation is now relocating + final RoutingTable routingTable = clusterState.routingTable(ProjectId.DEFAULT); + final var notPreferredShard = routingTable.shardRoutingTable(notPreferredAllocationMaxWriteLoad.getIndex().getName(), 0) + .primaryShard(); + assertTrue("unexpected shard state: " + notPreferredShard, notPreferredShard.relocating()); + + // Still no rebalancing for indices with unbalanced write loads due to returning early + for (int i = 0; i < numWriteLoadIndices; i++) { + final var writeLoadShard = routingTable.shardRoutingTable("large-write-load-" + i, 0).primaryShard(); + assertTrue("unexpected shard state: " + writeLoadShard, writeLoadShard.started()); + } + } + + // Fifth reroute + clusterState = startInitializingShardsAndReroute(allocationService, clusterState); { // Rebalance should happen for one and only one of the indices with unbalanced write loads due to returning early final RoutingTable routingTable = clusterState.routingTable(ProjectId.DEFAULT); From 3c65dc8c59e164797e50a254b26d2752a094fd3e Mon Sep 17 00:00:00 2001 From: Nick Tindall Date: Mon, 22 Sep 2025 16:23:52 +1000 Subject: [PATCH 21/63] Pedantry --- .../allocation/allocator/BalancedShardsAllocator.java | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/server/src/main/java/org/elasticsearch/cluster/routing/allocation/allocator/BalancedShardsAllocator.java b/server/src/main/java/org/elasticsearch/cluster/routing/allocation/allocator/BalancedShardsAllocator.java index 5e9f805c465ad..8948b766fdbfa 100644 --- a/server/src/main/java/org/elasticsearch/cluster/routing/allocation/allocator/BalancedShardsAllocator.java +++ b/server/src/main/java/org/elasticsearch/cluster/routing/allocation/allocator/BalancedShardsAllocator.java @@ -1000,10 +1000,10 @@ private MoveDecision decideMove( * This is not guaranteed to be balanced after this operation we still try best effort to * allocate on the minimal eligible node. */ - BiFunction decider = canRemain.type() == Type.NOT_PREFERRED + final BiFunction decider = canRemain.type() == Type.NOT_PREFERRED ? this::decideCanAllocatePreferredOnly : this::decideCanAllocate; - MoveDecision moveDecision = decideMove(sorter, shardRouting, sourceNode, canRemain, decider); + final MoveDecision moveDecision = decideMove(sorter, shardRouting, sourceNode, canRemain, decider); if (moveDecision.getCanRemainDecision().type() == Type.NO && moveDecision.canRemain() == false && moveDecision.forceMove() == false) { @@ -1058,7 +1058,7 @@ private MoveDecision decideMove( } private Decision decideCanAllocatePreferredOnly(ShardRouting shardRouting, RoutingNode target) { - Decision decision = allocation.deciders().canAllocate(shardRouting, target, allocation); + final Decision decision = allocation.deciders().canAllocate(shardRouting, target, allocation); // not-preferred means no here if (decision.type() == Type.NOT_PREFERRED) { return Decision.NO; From 3fd27b556617dbe4d5dfe9dba539b6c0c23fc3b5 Mon Sep 17 00:00:00 2001 From: Nick Tindall Date: Mon, 22 Sep 2025 16:26:47 +1000 Subject: [PATCH 22/63] Pedantry --- .../allocation/allocator/BalancedShardsAllocator.java | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/server/src/main/java/org/elasticsearch/cluster/routing/allocation/allocator/BalancedShardsAllocator.java b/server/src/main/java/org/elasticsearch/cluster/routing/allocation/allocator/BalancedShardsAllocator.java index 8948b766fdbfa..681bf7990e6c2 100644 --- a/server/src/main/java/org/elasticsearch/cluster/routing/allocation/allocator/BalancedShardsAllocator.java +++ b/server/src/main/java/org/elasticsearch/cluster/routing/allocation/allocator/BalancedShardsAllocator.java @@ -849,7 +849,7 @@ private void executeMove(ShardRouting shardRouting, ProjectIndex index, MoveDeci final ModelNode sourceNode = nodes.get(shardRouting.currentNodeId()); final ModelNode targetNode = nodes.get(moveDecision.getTargetNode().getId()); sourceNode.removeShard(index, shardRouting); - Tuple relocatingShards = routingNodes.relocateShard( + final Tuple relocatingShards = routingNodes.relocateShard( shardRouting, targetNode.getNodeId(), allocation.clusterInfo().getShardSize(shardRouting, ShardRouting.UNAVAILABLE_EXPECTED_SHARD_SIZE), @@ -905,6 +905,7 @@ public int compare(ShardRouting lhs, ShardRouting rhs) { + lhs.currentNodeId() + ", rhs=" + rhs.currentNodeId(); + // If we have no shard write-load data, shortcut if (maxWriteLoadOnNode == MISSING_WRITE_LOAD) { return 0; @@ -954,7 +955,7 @@ public int compare(ShardRouting lhs, ShardRouting rhs) { * 4. If the method is invoked in explain mode (e.g. from the cluster allocation explain APIs), then * {@link MoveDecision#getNodeDecisions} will have a non-null value. */ - public MoveDecision decideMove(ProjectIndex index, ShardRouting shardRouting) { + public MoveDecision decideMove(final ProjectIndex index, final ShardRouting shardRouting) { // This is when we're not calling in a loop, so we don't need to keep track of "best non-preferred" shards return decideMove(index, shardRouting, Map.of(), Map.of()); } @@ -981,7 +982,7 @@ private MoveDecision decideMove( return MoveDecision.remain(canRemain); } - // Investigate whether it's potentially a better move to make than the one we've discovered already + // Investigate whether it's a better move to make than one we've discovered already if (canRemain.type() == Type.NOT_PREFERRED && bestNonPreferredShardsByNode.containsKey(shardRouting.currentNodeId())) { int compare = comparatorCache.computeIfAbsent( shardRouting.currentNodeId(), From 063995cd7e9cfad18b230b62e399fd4cbdbbacaf Mon Sep 17 00:00:00 2001 From: Nick Tindall Date: Mon, 22 Sep 2025 16:43:59 +1000 Subject: [PATCH 23/63] Reduce logging --- .../allocation/allocator/BalancedShardsAllocatorTests.java | 1 - 1 file changed, 1 deletion(-) diff --git a/server/src/test/java/org/elasticsearch/cluster/routing/allocation/allocator/BalancedShardsAllocatorTests.java b/server/src/test/java/org/elasticsearch/cluster/routing/allocation/allocator/BalancedShardsAllocatorTests.java index 523d18f6f46d0..5c7ee6144f5bd 100644 --- a/server/src/test/java/org/elasticsearch/cluster/routing/allocation/allocator/BalancedShardsAllocatorTests.java +++ b/server/src/test/java/org/elasticsearch/cluster/routing/allocation/allocator/BalancedShardsAllocatorTests.java @@ -964,7 +964,6 @@ public void testShardMovementPriorityComparator() { // Assign all shards to node final var allocatedRoutingNodes = allocation.routingNodes().mutableCopy(); for (ShardRouting shardRouting : allocatedRoutingNodes.unassigned()) { - logger.info("--> allocating shard [{}] to node [{}]", shardRouting, nodeId); allocatedRoutingNodes.initializeShard(shardRouting, nodeId, null, randomNonNegativeLong(), NOOP); } From 3815293086b177205f342295f0442fb791ba06d8 Mon Sep 17 00:00:00 2001 From: Nick Tindall Date: Mon, 22 Sep 2025 16:47:35 +1000 Subject: [PATCH 24/63] Naming --- .../allocator/BalancedShardsAllocatorTests.java | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/server/src/test/java/org/elasticsearch/cluster/routing/allocation/allocator/BalancedShardsAllocatorTests.java b/server/src/test/java/org/elasticsearch/cluster/routing/allocation/allocator/BalancedShardsAllocatorTests.java index 5c7ee6144f5bd..0a597c3bbe526 100644 --- a/server/src/test/java/org/elasticsearch/cluster/routing/allocation/allocator/BalancedShardsAllocatorTests.java +++ b/server/src/test/java/org/elasticsearch/cluster/routing/allocation/allocator/BalancedShardsAllocatorTests.java @@ -948,9 +948,14 @@ public void testShardMovementPriorityComparator() { final var allShards = stateWithIndices.routingTable(ProjectId.DEFAULT).allShards().collect(toSet()); final var shardWriteLoads = new HashMap(); - addRandomWriteLoads(shardWriteLoads, allShards, numAtMax, () -> maxWriteLoad); - addRandomWriteLoads(shardWriteLoads, allShards, numBetweenLowAndMax, () -> randomDoubleBetween(lowThreshold, maxWriteLoad, true)); - addRandomWriteLoads(shardWriteLoads, allShards, numBelowLow, () -> randomDoubleBetween(0, lowThreshold, true)); + addRandomWriteLoadAndRemoveShard(shardWriteLoads, allShards, numAtMax, () -> maxWriteLoad); + addRandomWriteLoadAndRemoveShard( + shardWriteLoads, + allShards, + numBetweenLowAndMax, + () -> randomDoubleBetween(lowThreshold, maxWriteLoad, true) + ); + addRandomWriteLoadAndRemoveShard(shardWriteLoads, allShards, numBelowLow, () -> randomDoubleBetween(0, lowThreshold, true)); assertThat(allShards, hasSize(numMissing)); final var allocation = new RoutingAllocation( @@ -1017,7 +1022,7 @@ public void testShardMovementPriorityComparator() { } } - private void addRandomWriteLoads( + private void addRandomWriteLoadAndRemoveShard( Map shardWriteLoads, Set shards, int count, From 34a88dfc420f5c444796713104ac6dcde635cc6c Mon Sep 17 00:00:00 2001 From: Nick Tindall Date: Mon, 22 Sep 2025 17:00:23 +1000 Subject: [PATCH 25/63] Constant --- .../routing/allocation/allocator/BalancedShardsAllocator.java | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/server/src/main/java/org/elasticsearch/cluster/routing/allocation/allocator/BalancedShardsAllocator.java b/server/src/main/java/org/elasticsearch/cluster/routing/allocation/allocator/BalancedShardsAllocator.java index 681bf7990e6c2..f698831dc4b87 100644 --- a/server/src/main/java/org/elasticsearch/cluster/routing/allocation/allocator/BalancedShardsAllocator.java +++ b/server/src/main/java/org/elasticsearch/cluster/routing/allocation/allocator/BalancedShardsAllocator.java @@ -875,6 +875,7 @@ private void executeMove(ShardRouting shardRouting, ProjectIndex index, MoveDeci // Visible for testing static class ShardMovementPriorityComparator implements Comparator { + private static final double THRESHOLD_RATIO = 0.5; private static final double MISSING_WRITE_LOAD = -1; private final Map shardWriteLoads; private final double maxWriteLoadOnNode; @@ -891,7 +892,7 @@ static class ShardMovementPriorityComparator implements Comparator ); } this.maxWriteLoadOnNode = maxWriteLoadOnNode; - threshold = maxWriteLoadOnNode * 0.5; + threshold = maxWriteLoadOnNode * THRESHOLD_RATIO; nodeId = routingNode.nodeId(); } From cb8ac9abe4f0db0476dc78857eebcea0cb692b01 Mon Sep 17 00:00:00 2001 From: Nick Tindall Date: Mon, 22 Sep 2025 17:01:49 +1000 Subject: [PATCH 26/63] Fix javadoc --- .../routing/allocation/allocator/BalancedShardsAllocator.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/server/src/main/java/org/elasticsearch/cluster/routing/allocation/allocator/BalancedShardsAllocator.java b/server/src/main/java/org/elasticsearch/cluster/routing/allocation/allocator/BalancedShardsAllocator.java index f698831dc4b87..f46b8dcfc2270 100644 --- a/server/src/main/java/org/elasticsearch/cluster/routing/allocation/allocator/BalancedShardsAllocator.java +++ b/server/src/main/java/org/elasticsearch/cluster/routing/allocation/allocator/BalancedShardsAllocator.java @@ -951,7 +951,7 @@ public int compare(ShardRouting lhs, ShardRouting rhs) { * 2. If the shard is allowed to remain on its current node, no attempt will be made to move the shard and * {@link MoveDecision#getCanRemainDecision} will have a decision type of YES. All other fields in the object will be null. * 3. If the shard is not allowed to remain on its current node, then {@link MoveDecision#getAllocationDecision()} will be - * populated with the decision of moving to another node. If {@link MoveDecision#forceMove()} ()} returns {@code true}, then + * populated with the decision of moving to another node. If {@link MoveDecision#forceMove()} returns {@code true}, then * {@link MoveDecision#getTargetNode} will return a non-null value, otherwise the assignedNodeId will be null. * 4. If the method is invoked in explain mode (e.g. from the cluster allocation explain APIs), then * {@link MoveDecision#getNodeDecisions} will have a non-null value. From 1c4dfaf76d049b9287a89063cfb9e9f52395faba Mon Sep 17 00:00:00 2001 From: Nick Tindall Date: Tue, 23 Sep 2025 14:25:26 +1000 Subject: [PATCH 27/63] Used cached MoveDecision if no other moves have been made --- .../allocator/BalancedShardsAllocator.java | 24 ++++++++++++------- 1 file changed, 16 insertions(+), 8 deletions(-) diff --git a/server/src/main/java/org/elasticsearch/cluster/routing/allocation/allocator/BalancedShardsAllocator.java b/server/src/main/java/org/elasticsearch/cluster/routing/allocation/allocator/BalancedShardsAllocator.java index f46b8dcfc2270..0f89941ac8d60 100644 --- a/server/src/main/java/org/elasticsearch/cluster/routing/allocation/allocator/BalancedShardsAllocator.java +++ b/server/src/main/java/org/elasticsearch/cluster/routing/allocation/allocator/BalancedShardsAllocator.java @@ -805,7 +805,7 @@ public boolean moveShards() { // Iterate over the started shards interleaving between nodes, and check if they can remain. In the presence of throttling // shard movements, the goal of this iteration order is to achieve a fairer movement of shards from the nodes that are // offloading the shards. - final Map bestNonPreferredShardsByNode = new HashMap<>(); + final Map bestNonPreferredShardsByNode = new HashMap<>(); final Map comparatorCache = new HashMap<>(); for (Iterator it = allocation.routingNodes().nodeInterleavedShardIterator(); it.hasNext();) { ShardRouting shardRouting = it.next(); @@ -814,7 +814,10 @@ public boolean moveShards() { if (moveDecision.isDecisionTaken() && moveDecision.forceMove()) { // Defer moving of not-preferred until we've moved the NOs if (moveDecision.getCanRemainDecision().type() == Type.NOT_PREFERRED) { - bestNonPreferredShardsByNode.put(shardRouting.currentNodeId(), shardRouting); + bestNonPreferredShardsByNode.put( + shardRouting.currentNodeId(), + new MoveNotPreferredDecision(shardRouting, moveDecision) + ); } else { executeMove(shardRouting, index, moveDecision, "move"); shardMoved = true; @@ -829,12 +832,15 @@ public boolean moveShards() { // If we get here, attempt to move one of the best not-preferred shards that we identified earlier for (var entry : bestNonPreferredShardsByNode.entrySet()) { - final var shardRouting = entry.getValue(); + final var cachedDecision = entry.getValue(); + final var shardRouting = cachedDecision.shardRouting(); final var index = projectIndex(shardRouting); - // Have to re-check move decision in case we made a move that invalidated our existing assessment - final MoveDecision moveDecision = decideMove(index, shardRouting); + // If `shardMoved` is true, there may have been moves that have made our previous move decision + // invalid, so we must call `decideMove` again. If not, we know we haven't made any moves, and we + // can use the cached decision. + final var moveDecision = shardMoved ? decideMove(index, shardRouting) : cachedDecision.moveDecision(); if (moveDecision.isDecisionTaken() && moveDecision.forceMove()) { - executeMove(entry.getValue(), index, moveDecision, "move-non-preferred"); + executeMove(shardRouting, index, moveDecision, "move-non-preferred"); // We only ever move a single non-preferred shard at a time return true; } else { @@ -845,6 +851,8 @@ public boolean moveShards() { return shardMoved; } + private record MoveNotPreferredDecision(ShardRouting shardRouting, MoveDecision moveDecision) {} + private void executeMove(ShardRouting shardRouting, ProjectIndex index, MoveDecision moveDecision, String reason) { final ModelNode sourceNode = nodes.get(shardRouting.currentNodeId()); final ModelNode targetNode = nodes.get(moveDecision.getTargetNode().getId()); @@ -964,7 +972,7 @@ public MoveDecision decideMove(final ProjectIndex index, final ShardRouting shar private MoveDecision decideMove( ProjectIndex index, ShardRouting shardRouting, - Map bestNonPreferredShardsByNode, + Map bestNonPreferredShardsByNode, Map comparatorCache ) { NodeSorter sorter = nodeSorters.sorterForShard(shardRouting); @@ -988,7 +996,7 @@ private MoveDecision decideMove( int compare = comparatorCache.computeIfAbsent( shardRouting.currentNodeId(), nodeId -> new ShardMovementPriorityComparator(allocation, allocation.routingNodes().node(nodeId)) - ).compare(shardRouting, bestNonPreferredShardsByNode.get(shardRouting.currentNodeId())); + ).compare(shardRouting, bestNonPreferredShardsByNode.get(shardRouting.currentNodeId()).shardRouting()); if (compare <= 0) { // Ignore inferior non-preferred moves return MoveDecision.NOT_TAKEN; From 738875dfed71a9223562dd659c1595aa03f7d1a7 Mon Sep 17 00:00:00 2001 From: Nick Tindall Date: Mon, 29 Sep 2025 12:52:19 +1000 Subject: [PATCH 28/63] Abstract best move tracking and comparison out of decideMove --- .../allocator/BalancedShardsAllocator.java | 95 ++++++++++++------- 1 file changed, 62 insertions(+), 33 deletions(-) diff --git a/server/src/main/java/org/elasticsearch/cluster/routing/allocation/allocator/BalancedShardsAllocator.java b/server/src/main/java/org/elasticsearch/cluster/routing/allocation/allocator/BalancedShardsAllocator.java index 0f89941ac8d60..26dd674471f33 100644 --- a/server/src/main/java/org/elasticsearch/cluster/routing/allocation/allocator/BalancedShardsAllocator.java +++ b/server/src/main/java/org/elasticsearch/cluster/routing/allocation/allocator/BalancedShardsAllocator.java @@ -54,6 +54,7 @@ import java.util.Map; import java.util.Set; import java.util.function.BiFunction; +import java.util.function.Predicate; import static org.elasticsearch.cluster.metadata.SingleNodeShutdownMetadata.Type.REPLACE; import static org.elasticsearch.cluster.routing.ExpectedShardSizeEstimator.getExpectedShardSize; @@ -805,19 +806,15 @@ public boolean moveShards() { // Iterate over the started shards interleaving between nodes, and check if they can remain. In the presence of throttling // shard movements, the goal of this iteration order is to achieve a fairer movement of shards from the nodes that are // offloading the shards. - final Map bestNonPreferredShardsByNode = new HashMap<>(); - final Map comparatorCache = new HashMap<>(); + final MostDesirableMovementsTracker movementsTracker = new MostDesirableMovementsTracker(); for (Iterator it = allocation.routingNodes().nodeInterleavedShardIterator(); it.hasNext();) { - ShardRouting shardRouting = it.next(); - ProjectIndex index = projectIndex(shardRouting); - final MoveDecision moveDecision = decideMove(index, shardRouting, bestNonPreferredShardsByNode, comparatorCache); + final ShardRouting shardRouting = it.next(); + final ProjectIndex index = projectIndex(shardRouting); + final MoveDecision moveDecision = decideMove(index, shardRouting, movementsTracker); if (moveDecision.isDecisionTaken() && moveDecision.forceMove()) { // Defer moving of not-preferred until we've moved the NOs if (moveDecision.getCanRemainDecision().type() == Type.NOT_PREFERRED) { - bestNonPreferredShardsByNode.put( - shardRouting.currentNodeId(), - new MoveNotPreferredDecision(shardRouting, moveDecision) - ); + movementsTracker.putCurrentMoveDecision(shardRouting, moveDecision); } else { executeMove(shardRouting, index, moveDecision, "move"); shardMoved = true; @@ -831,14 +828,13 @@ public boolean moveShards() { } // If we get here, attempt to move one of the best not-preferred shards that we identified earlier - for (var entry : bestNonPreferredShardsByNode.entrySet()) { - final var cachedDecision = entry.getValue(); - final var shardRouting = cachedDecision.shardRouting(); + for (var preferredMove : movementsTracker.getPreferredShardMovements()) { + final var shardRouting = preferredMove.shardRouting(); final var index = projectIndex(shardRouting); // If `shardMoved` is true, there may have been moves that have made our previous move decision // invalid, so we must call `decideMove` again. If not, we know we haven't made any moves, and we // can use the cached decision. - final var moveDecision = shardMoved ? decideMove(index, shardRouting) : cachedDecision.moveDecision(); + final var moveDecision = shardMoved ? decideMove(index, shardRouting) : preferredMove.moveDecision(); if (moveDecision.isDecisionTaken() && moveDecision.forceMove()) { executeMove(shardRouting, index, moveDecision, "move-non-preferred"); // We only ever move a single non-preferred shard at a time @@ -851,6 +847,38 @@ public boolean moveShards() { return shardMoved; } + /** + * Stores the most desirable shard seen so far and compares proposed shards against it using + * the {@link ShardMovementPriorityComparator}. + */ + private class MostDesirableMovementsTracker implements Predicate { + + private final Map bestNonPreferredShardsByNode = new HashMap<>(); + private final Map comparatorCache = new HashMap<>(); + + @Override + public boolean test(ShardRouting shardRouting) { + final var currentShardForNode = bestNonPreferredShardsByNode.get(shardRouting.currentNodeId()); + if (currentShardForNode == null) { + return true; + } + int comparison = comparatorCache.computeIfAbsent( + shardRouting.currentNodeId(), + nodeId -> new ShardMovementPriorityComparator(allocation, allocation.routingNodes().node(nodeId)) + ).compare(shardRouting, currentShardForNode.shardRouting()); + // Ignore inferior non-preferred moves + return comparison > 0; + } + + public void putCurrentMoveDecision(ShardRouting shardRouting, MoveDecision moveDecision) { + bestNonPreferredShardsByNode.put(shardRouting.currentNodeId(), new MoveNotPreferredDecision(shardRouting, moveDecision)); + } + + public Iterable getPreferredShardMovements() { + return bestNonPreferredShardsByNode.values(); + } + } + private record MoveNotPreferredDecision(ShardRouting shardRouting, MoveDecision moveDecision) {} private void executeMove(ShardRouting shardRouting, ProjectIndex index, MoveDecision moveDecision, String reason) { @@ -952,6 +980,19 @@ public int compare(ShardRouting lhs, ShardRouting rhs) { } } + /** + * Decide whether to move a started shard to another node. + * Unconditional version of {@link #decideMove(ProjectIndex, ShardRouting, Predicate)} + * + * @param index The index that the shard being considered belongs to + * @param shardRouting The shard routing being considered for movement + * @return The {@link MoveDecision} for the shard + */ + public MoveDecision decideMove(final ProjectIndex index, final ShardRouting shardRouting) { + // Always assess options for non-preferred allocations + return decideMove(index, shardRouting, ignored -> true); + } + /** * Makes a decision on whether to move a started shard to another node. The following rules apply * to the {@link MoveDecision} return object: @@ -963,18 +1004,13 @@ public int compare(ShardRouting lhs, ShardRouting rhs) { * {@link MoveDecision#getTargetNode} will return a non-null value, otherwise the assignedNodeId will be null. * 4. If the method is invoked in explain mode (e.g. from the cluster allocation explain APIs), then * {@link MoveDecision#getNodeDecisions} will have a non-null value. + * + * @param index The index that the shard being considered belongs to + * @param shardRouting The shard routing being considered for movement + * @param nonPreferredPredicate A predicate used to determine whether to assess move options for shards in non-preferred allocations + * @return The {@link MoveDecision} for the shard */ - public MoveDecision decideMove(final ProjectIndex index, final ShardRouting shardRouting) { - // This is when we're not calling in a loop, so we don't need to keep track of "best non-preferred" shards - return decideMove(index, shardRouting, Map.of(), Map.of()); - } - - private MoveDecision decideMove( - ProjectIndex index, - ShardRouting shardRouting, - Map bestNonPreferredShardsByNode, - Map comparatorCache - ) { + private MoveDecision decideMove(ProjectIndex index, ShardRouting shardRouting, Predicate nonPreferredPredicate) { NodeSorter sorter = nodeSorters.sorterForShard(shardRouting); index.assertMatch(shardRouting); @@ -992,15 +1028,8 @@ private MoveDecision decideMove( } // Investigate whether it's a better move to make than one we've discovered already - if (canRemain.type() == Type.NOT_PREFERRED && bestNonPreferredShardsByNode.containsKey(shardRouting.currentNodeId())) { - int compare = comparatorCache.computeIfAbsent( - shardRouting.currentNodeId(), - nodeId -> new ShardMovementPriorityComparator(allocation, allocation.routingNodes().node(nodeId)) - ).compare(shardRouting, bestNonPreferredShardsByNode.get(shardRouting.currentNodeId()).shardRouting()); - if (compare <= 0) { - // Ignore inferior non-preferred moves - return MoveDecision.NOT_TAKEN; - } + if (canRemain.type() == Type.NOT_PREFERRED && nonPreferredPredicate.test(shardRouting) == false) { + return MoveDecision.NOT_TAKEN; } sorter.reset(index); From f1061e26726a75677719343f011ee011fec3d5c7 Mon Sep 17 00:00:00 2001 From: Nick Tindall Date: Mon, 29 Sep 2025 13:56:57 +1000 Subject: [PATCH 29/63] Fix comment --- .../routing/allocation/allocator/BalancedShardsAllocator.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/server/src/main/java/org/elasticsearch/cluster/routing/allocation/allocator/BalancedShardsAllocator.java b/server/src/main/java/org/elasticsearch/cluster/routing/allocation/allocator/BalancedShardsAllocator.java index 26dd674471f33..439403c71466d 100644 --- a/server/src/main/java/org/elasticsearch/cluster/routing/allocation/allocator/BalancedShardsAllocator.java +++ b/server/src/main/java/org/elasticsearch/cluster/routing/allocation/allocator/BalancedShardsAllocator.java @@ -1027,7 +1027,7 @@ private MoveDecision decideMove(ProjectIndex index, ShardRouting shardRouting, P return MoveDecision.remain(canRemain); } - // Investigate whether it's a better move to make than one we've discovered already + // Check predicate to decide whether to assess movement options if (canRemain.type() == Type.NOT_PREFERRED && nonPreferredPredicate.test(shardRouting) == false) { return MoveDecision.NOT_TAKEN; } From 7b423768cf19b82e33b588351bb9044b99e5b3c2 Mon Sep 17 00:00:00 2001 From: Nick Tindall Date: Mon, 29 Sep 2025 14:01:32 +1000 Subject: [PATCH 30/63] Move MostDesirableMovementsTracker and ShardMovementPriorityComparator below decideMove --- .../allocator/BalancedShardsAllocator.java | 226 +++++++++--------- 1 file changed, 113 insertions(+), 113 deletions(-) diff --git a/server/src/main/java/org/elasticsearch/cluster/routing/allocation/allocator/BalancedShardsAllocator.java b/server/src/main/java/org/elasticsearch/cluster/routing/allocation/allocator/BalancedShardsAllocator.java index 439403c71466d..8770aa18a8586 100644 --- a/server/src/main/java/org/elasticsearch/cluster/routing/allocation/allocator/BalancedShardsAllocator.java +++ b/server/src/main/java/org/elasticsearch/cluster/routing/allocation/allocator/BalancedShardsAllocator.java @@ -847,38 +847,6 @@ public boolean moveShards() { return shardMoved; } - /** - * Stores the most desirable shard seen so far and compares proposed shards against it using - * the {@link ShardMovementPriorityComparator}. - */ - private class MostDesirableMovementsTracker implements Predicate { - - private final Map bestNonPreferredShardsByNode = new HashMap<>(); - private final Map comparatorCache = new HashMap<>(); - - @Override - public boolean test(ShardRouting shardRouting) { - final var currentShardForNode = bestNonPreferredShardsByNode.get(shardRouting.currentNodeId()); - if (currentShardForNode == null) { - return true; - } - int comparison = comparatorCache.computeIfAbsent( - shardRouting.currentNodeId(), - nodeId -> new ShardMovementPriorityComparator(allocation, allocation.routingNodes().node(nodeId)) - ).compare(shardRouting, currentShardForNode.shardRouting()); - // Ignore inferior non-preferred moves - return comparison > 0; - } - - public void putCurrentMoveDecision(ShardRouting shardRouting, MoveDecision moveDecision) { - bestNonPreferredShardsByNode.put(shardRouting.currentNodeId(), new MoveNotPreferredDecision(shardRouting, moveDecision)); - } - - public Iterable getPreferredShardMovements() { - return bestNonPreferredShardsByNode.values(); - } - } - private record MoveNotPreferredDecision(ShardRouting shardRouting, MoveDecision moveDecision) {} private void executeMove(ShardRouting shardRouting, ProjectIndex index, MoveDecision moveDecision, String reason) { @@ -899,87 +867,6 @@ private void executeMove(ShardRouting shardRouting, ProjectIndex index, MoveDeci } } - /** - * Sorts shards by desirability to move, ranking goes (in descending priority order) - *
    - *
  1. Shards with write-load in {threshold} -> maximum write-load (exclusive)
  2. - *
  3. Shards with write-load in {threshold} -> 0
  4. - *
  5. Shards with maximum write-load
  6. - *
  7. Shards with missing write-load
  8. - *
- */ - // Visible for testing - static class ShardMovementPriorityComparator implements Comparator { - - private static final double THRESHOLD_RATIO = 0.5; - private static final double MISSING_WRITE_LOAD = -1; - private final Map shardWriteLoads; - private final double maxWriteLoadOnNode; - private final double threshold; - private final String nodeId; - - ShardMovementPriorityComparator(RoutingAllocation allocation, RoutingNode routingNode) { - shardWriteLoads = allocation.clusterInfo().getShardWriteLoads(); - double maxWriteLoadOnNode = MISSING_WRITE_LOAD; - for (ShardRouting shardRouting : routingNode) { - maxWriteLoadOnNode = Math.max( - maxWriteLoadOnNode, - shardWriteLoads.getOrDefault(shardRouting.shardId(), MISSING_WRITE_LOAD) - ); - } - this.maxWriteLoadOnNode = maxWriteLoadOnNode; - threshold = maxWriteLoadOnNode * THRESHOLD_RATIO; - nodeId = routingNode.nodeId(); - } - - @Override - public int compare(ShardRouting lhs, ShardRouting rhs) { - assert nodeId.equals(lhs.currentNodeId()) && nodeId.equals(rhs.currentNodeId()) - : this.getClass().getSimpleName() - + " is node-specific. comparator=" - + nodeId - + ", lhs=" - + lhs.currentNodeId() - + ", rhs=" - + rhs.currentNodeId(); - - // If we have no shard write-load data, shortcut - if (maxWriteLoadOnNode == MISSING_WRITE_LOAD) { - return 0; - } - - final double lhsWriteLoad = shardWriteLoads.getOrDefault(lhs.shardId(), MISSING_WRITE_LOAD); - final double rhsWriteLoad = shardWriteLoads.getOrDefault(rhs.shardId(), MISSING_WRITE_LOAD); - - // prefer any known write-load over any unknown write-load - final var rhsIsMissing = rhsWriteLoad == MISSING_WRITE_LOAD; - final var lhsIsMissing = lhsWriteLoad == MISSING_WRITE_LOAD; - if (rhsIsMissing ^ lhsIsMissing) { - return lhsIsMissing ? -1 : 1; - } - - if (lhsWriteLoad < maxWriteLoadOnNode && rhsWriteLoad < maxWriteLoadOnNode) { - final var lhsOverThreshold = lhsWriteLoad >= threshold; - final var rhsOverThreshold = rhsWriteLoad >= threshold; - if (lhsOverThreshold && rhsOverThreshold) { - // Both values between threshold and maximum, prefer lowest - return Double.compare(rhsWriteLoad, lhsWriteLoad); - } else if (lhsOverThreshold) { - // lhs between threshold and maximum, rhs below threshold, prefer lhs - return 1; - } else if (rhsOverThreshold) { - // lhs below threshold, rhs between threshold and maximum, prefer rhs - return -1; - } - // Both values below the threshold, prefer highest - return Double.compare(lhsWriteLoad, rhsWriteLoad); - } - - // prefer the non-max write load if there is one - return Double.compare(rhsWriteLoad, lhsWriteLoad); - } - } - /** * Decide whether to move a started shard to another node. * Unconditional version of {@link #decideMove(ProjectIndex, ShardRouting, Predicate)} @@ -1096,6 +983,119 @@ private MoveDecision decideMove( ); } + /** + * Stores the most desirable shard seen so far and compares proposed shards against it using + * the {@link ShardMovementPriorityComparator}. + */ + private class MostDesirableMovementsTracker implements Predicate { + + private final Map bestNonPreferredShardsByNode = new HashMap<>(); + private final Map comparatorCache = new HashMap<>(); + + @Override + public boolean test(ShardRouting shardRouting) { + final var currentShardForNode = bestNonPreferredShardsByNode.get(shardRouting.currentNodeId()); + if (currentShardForNode == null) { + return true; + } + int comparison = comparatorCache.computeIfAbsent( + shardRouting.currentNodeId(), + nodeId -> new ShardMovementPriorityComparator(allocation, allocation.routingNodes().node(nodeId)) + ).compare(shardRouting, currentShardForNode.shardRouting()); + // Ignore inferior non-preferred moves + return comparison > 0; + } + + public void putCurrentMoveDecision(ShardRouting shardRouting, MoveDecision moveDecision) { + bestNonPreferredShardsByNode.put(shardRouting.currentNodeId(), new MoveNotPreferredDecision(shardRouting, moveDecision)); + } + + public Iterable getPreferredShardMovements() { + return bestNonPreferredShardsByNode.values(); + } + } + + /** + * Sorts shards by desirability to move, ranking goes (in descending priority order) + *
    + *
  1. Shards with write-load in {threshold} -> maximum write-load (exclusive)
  2. + *
  3. Shards with write-load in {threshold} -> 0
  4. + *
  5. Shards with maximum write-load
  6. + *
  7. Shards with missing write-load
  8. + *
+ */ + // Visible for testing + static class ShardMovementPriorityComparator implements Comparator { + + private static final double THRESHOLD_RATIO = 0.5; + private static final double MISSING_WRITE_LOAD = -1; + private final Map shardWriteLoads; + private final double maxWriteLoadOnNode; + private final double threshold; + private final String nodeId; + + ShardMovementPriorityComparator(RoutingAllocation allocation, RoutingNode routingNode) { + shardWriteLoads = allocation.clusterInfo().getShardWriteLoads(); + double maxWriteLoadOnNode = MISSING_WRITE_LOAD; + for (ShardRouting shardRouting : routingNode) { + maxWriteLoadOnNode = Math.max( + maxWriteLoadOnNode, + shardWriteLoads.getOrDefault(shardRouting.shardId(), MISSING_WRITE_LOAD) + ); + } + this.maxWriteLoadOnNode = maxWriteLoadOnNode; + threshold = maxWriteLoadOnNode * THRESHOLD_RATIO; + nodeId = routingNode.nodeId(); + } + + @Override + public int compare(ShardRouting lhs, ShardRouting rhs) { + assert nodeId.equals(lhs.currentNodeId()) && nodeId.equals(rhs.currentNodeId()) + : this.getClass().getSimpleName() + + " is node-specific. comparator=" + + nodeId + + ", lhs=" + + lhs.currentNodeId() + + ", rhs=" + + rhs.currentNodeId(); + + // If we have no shard write-load data, shortcut + if (maxWriteLoadOnNode == MISSING_WRITE_LOAD) { + return 0; + } + + final double lhsWriteLoad = shardWriteLoads.getOrDefault(lhs.shardId(), MISSING_WRITE_LOAD); + final double rhsWriteLoad = shardWriteLoads.getOrDefault(rhs.shardId(), MISSING_WRITE_LOAD); + + // prefer any known write-load over any unknown write-load + final var rhsIsMissing = rhsWriteLoad == MISSING_WRITE_LOAD; + final var lhsIsMissing = lhsWriteLoad == MISSING_WRITE_LOAD; + if (rhsIsMissing ^ lhsIsMissing) { + return lhsIsMissing ? -1 : 1; + } + + if (lhsWriteLoad < maxWriteLoadOnNode && rhsWriteLoad < maxWriteLoadOnNode) { + final var lhsOverThreshold = lhsWriteLoad >= threshold; + final var rhsOverThreshold = rhsWriteLoad >= threshold; + if (lhsOverThreshold && rhsOverThreshold) { + // Both values between threshold and maximum, prefer lowest + return Double.compare(rhsWriteLoad, lhsWriteLoad); + } else if (lhsOverThreshold) { + // lhs between threshold and maximum, rhs below threshold, prefer lhs + return 1; + } else if (rhsOverThreshold) { + // lhs below threshold, rhs between threshold and maximum, prefer rhs + return -1; + } + // Both values below the threshold, prefer highest + return Double.compare(lhsWriteLoad, rhsWriteLoad); + } + + // prefer the non-max write load if there is one + return Double.compare(rhsWriteLoad, lhsWriteLoad); + } + } + private Decision decideCanAllocatePreferredOnly(ShardRouting shardRouting, RoutingNode target) { final Decision decision = allocation.deciders().canAllocate(shardRouting, target, allocation); // not-preferred means no here From baf802d24a4b93abe5847a92a7626a4ebcbb357e Mon Sep 17 00:00:00 2001 From: Nick Tindall Date: Mon, 29 Sep 2025 14:04:51 +1000 Subject: [PATCH 31/63] Put explicit check in for rhsMissing and lhsMissing --- .../routing/allocation/allocator/BalancedShardsAllocator.java | 3 +++ 1 file changed, 3 insertions(+) diff --git a/server/src/main/java/org/elasticsearch/cluster/routing/allocation/allocator/BalancedShardsAllocator.java b/server/src/main/java/org/elasticsearch/cluster/routing/allocation/allocator/BalancedShardsAllocator.java index 8770aa18a8586..9cba811db02d0 100644 --- a/server/src/main/java/org/elasticsearch/cluster/routing/allocation/allocator/BalancedShardsAllocator.java +++ b/server/src/main/java/org/elasticsearch/cluster/routing/allocation/allocator/BalancedShardsAllocator.java @@ -1070,6 +1070,9 @@ public int compare(ShardRouting lhs, ShardRouting rhs) { // prefer any known write-load over any unknown write-load final var rhsIsMissing = rhsWriteLoad == MISSING_WRITE_LOAD; final var lhsIsMissing = lhsWriteLoad == MISSING_WRITE_LOAD; + if (rhsIsMissing && lhsIsMissing) { + return 0; + } if (rhsIsMissing ^ lhsIsMissing) { return lhsIsMissing ? -1 : 1; } From 104be725ffff164f4c13c1b0c9353279c6a59ebe Mon Sep 17 00:00:00 2001 From: Nick Tindall Date: Mon, 29 Sep 2025 14:09:59 +1000 Subject: [PATCH 32/63] Move shardMoved update below early exit --- .../routing/allocation/allocator/BalancedShardsAllocator.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/server/src/main/java/org/elasticsearch/cluster/routing/allocation/allocator/BalancedShardsAllocator.java b/server/src/main/java/org/elasticsearch/cluster/routing/allocation/allocator/BalancedShardsAllocator.java index 9cba811db02d0..85b37d0d447f4 100644 --- a/server/src/main/java/org/elasticsearch/cluster/routing/allocation/allocator/BalancedShardsAllocator.java +++ b/server/src/main/java/org/elasticsearch/cluster/routing/allocation/allocator/BalancedShardsAllocator.java @@ -817,10 +817,10 @@ public boolean moveShards() { movementsTracker.putCurrentMoveDecision(shardRouting, moveDecision); } else { executeMove(shardRouting, index, moveDecision, "move"); - shardMoved = true; if (completeEarlyOnShardAssignmentChange) { return true; } + shardMoved = true; } } else if (moveDecision.isDecisionTaken() && moveDecision.canRemain() == false) { logger.trace("[{}][{}] can't move", shardRouting.index(), shardRouting.id()); From 1302b4647756776ba30aa1256ccdcb23fe09fedf Mon Sep 17 00:00:00 2001 From: elasticsearchmachine Date: Mon, 29 Sep 2025 04:17:39 +0000 Subject: [PATCH 33/63] [CI] Update transport version definitions --- server/src/main/resources/transport/upper_bounds/8.18.csv | 2 +- server/src/main/resources/transport/upper_bounds/8.19.csv | 2 +- server/src/main/resources/transport/upper_bounds/9.0.csv | 2 +- server/src/main/resources/transport/upper_bounds/9.1.csv | 2 +- server/src/main/resources/transport/upper_bounds/9.2.csv | 2 +- 5 files changed, 5 insertions(+), 5 deletions(-) diff --git a/server/src/main/resources/transport/upper_bounds/8.18.csv b/server/src/main/resources/transport/upper_bounds/8.18.csv index ffc592e1809ee..266bfbbd3bf78 100644 --- a/server/src/main/resources/transport/upper_bounds/8.18.csv +++ b/server/src/main/resources/transport/upper_bounds/8.18.csv @@ -1 +1 @@ -initial_elasticsearch_8_18_8,8840010 +transform_check_for_dangling_tasks,8840011 diff --git a/server/src/main/resources/transport/upper_bounds/8.19.csv b/server/src/main/resources/transport/upper_bounds/8.19.csv index 3cc6f439c5ea5..3600b3f8c633a 100644 --- a/server/src/main/resources/transport/upper_bounds/8.19.csv +++ b/server/src/main/resources/transport/upper_bounds/8.19.csv @@ -1 +1 @@ -initial_elasticsearch_8_19_5,8841069 +transform_check_for_dangling_tasks,8841070 diff --git a/server/src/main/resources/transport/upper_bounds/9.0.csv b/server/src/main/resources/transport/upper_bounds/9.0.csv index 8ad2ed1a4cacf..c11e6837bb813 100644 --- a/server/src/main/resources/transport/upper_bounds/9.0.csv +++ b/server/src/main/resources/transport/upper_bounds/9.0.csv @@ -1 +1 @@ -initial_elasticsearch_9_0_8,9000017 +transform_check_for_dangling_tasks,9000018 diff --git a/server/src/main/resources/transport/upper_bounds/9.1.csv b/server/src/main/resources/transport/upper_bounds/9.1.csv index 1cea5dc4d929b..80b97d85f7511 100644 --- a/server/src/main/resources/transport/upper_bounds/9.1.csv +++ b/server/src/main/resources/transport/upper_bounds/9.1.csv @@ -1 +1 @@ -initial_elasticsearch_9_1_5,9112008 +transform_check_for_dangling_tasks,9112009 diff --git a/server/src/main/resources/transport/upper_bounds/9.2.csv b/server/src/main/resources/transport/upper_bounds/9.2.csv index 6e7d51d3d3020..980b8e87c6329 100644 --- a/server/src/main/resources/transport/upper_bounds/9.2.csv +++ b/server/src/main/resources/transport/upper_bounds/9.2.csv @@ -1 +1 @@ -security_stats_endpoint,9168000 +extended_search_usage_telemetry,9177000 From 0055dc7808926d9a71bbd14c6f32a7ee58ad97c3 Mon Sep 17 00:00:00 2001 From: Nick Tindall Date: Mon, 29 Sep 2025 14:52:48 +1000 Subject: [PATCH 34/63] Clarify natural ordering in comparator --- .../allocation/allocator/BalancedShardsAllocator.java | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/server/src/main/java/org/elasticsearch/cluster/routing/allocation/allocator/BalancedShardsAllocator.java b/server/src/main/java/org/elasticsearch/cluster/routing/allocation/allocator/BalancedShardsAllocator.java index 85b37d0d447f4..176aa51ab1f67 100644 --- a/server/src/main/java/org/elasticsearch/cluster/routing/allocation/allocator/BalancedShardsAllocator.java +++ b/server/src/main/java/org/elasticsearch/cluster/routing/allocation/allocator/BalancedShardsAllocator.java @@ -1018,11 +1018,18 @@ public Iterable getPreferredShardMovements() { /** * Sorts shards by desirability to move, ranking goes (in descending priority order) *
    - *
  1. Shards with write-load in {threshold} -> maximum write-load (exclusive)
  2. - *
  3. Shards with write-load in {threshold} -> 0
  4. + *
  5. Shards with write-load in {threshold} -> maximum write-load (exclusive)
  6. + *
  7. Shards with write-load in {threshold} -> 0
  8. *
  9. Shards with maximum write-load
  10. *
  11. Shards with missing write-load
  12. *
+ * + * e.g., for any two ShardRoutings, r1 and r2, + *
    + *
  • compare(r1, r2) > 0 when r1 is most desirable to move
  • + *
  • compare(r1, r2) == 0 when the two shards are equally desirable to move
  • + *
  • compare(r1, r2) < 0 when r2 is most desirable to move
  • + *
*/ // Visible for testing static class ShardMovementPriorityComparator implements Comparator { From a031158e285807e3e9ee6d6ed8eb87438125dd62 Mon Sep 17 00:00:00 2001 From: Nick Tindall Date: Mon, 29 Sep 2025 15:04:35 +1000 Subject: [PATCH 35/63] Escape > and < --- .../routing/allocation/allocator/BalancedShardsAllocator.java | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/server/src/main/java/org/elasticsearch/cluster/routing/allocation/allocator/BalancedShardsAllocator.java b/server/src/main/java/org/elasticsearch/cluster/routing/allocation/allocator/BalancedShardsAllocator.java index 176aa51ab1f67..4be389598ddd6 100644 --- a/server/src/main/java/org/elasticsearch/cluster/routing/allocation/allocator/BalancedShardsAllocator.java +++ b/server/src/main/java/org/elasticsearch/cluster/routing/allocation/allocator/BalancedShardsAllocator.java @@ -1026,9 +1026,9 @@ public Iterable getPreferredShardMovements() { * * e.g., for any two ShardRoutings, r1 and r2, *
    - *
  • compare(r1, r2) > 0 when r1 is most desirable to move
  • + *
  • compare(r1, r2) > 0 when r1 is most desirable to move
  • *
  • compare(r1, r2) == 0 when the two shards are equally desirable to move
  • - *
  • compare(r1, r2) < 0 when r2 is most desirable to move
  • + *
  • compare(r1, r2) < 0 when r2 is most desirable to move
  • *
*/ // Visible for testing From ede9e449eadfb11501d91fa62f8fe9d8ef9c4ed1 Mon Sep 17 00:00:00 2001 From: Nick Tindall Date: Mon, 29 Sep 2025 16:51:43 +1000 Subject: [PATCH 36/63] Move MoveNotPreferredDecision local to where it's used --- .../allocation/allocator/BalancedShardsAllocator.java | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/server/src/main/java/org/elasticsearch/cluster/routing/allocation/allocator/BalancedShardsAllocator.java b/server/src/main/java/org/elasticsearch/cluster/routing/allocation/allocator/BalancedShardsAllocator.java index 1f47f2452f368..24c822867e843 100644 --- a/server/src/main/java/org/elasticsearch/cluster/routing/allocation/allocator/BalancedShardsAllocator.java +++ b/server/src/main/java/org/elasticsearch/cluster/routing/allocation/allocator/BalancedShardsAllocator.java @@ -850,8 +850,6 @@ public boolean moveShards() { return shardMoved; } - private record MoveNotPreferredDecision(ShardRouting shardRouting, MoveDecision moveDecision) {} - private void executeMove(ShardRouting shardRouting, ProjectIndex index, MoveDecision moveDecision, String reason) { final ModelNode sourceNode = nodes.get(shardRouting.currentNodeId()); final ModelNode targetNode = nodes.get(moveDecision.getTargetNode().getId()); @@ -1000,7 +998,9 @@ private MoveDecision decideMove( */ private class MostDesirableMovementsTracker implements Predicate { - private final Map bestNonPreferredShardsByNode = new HashMap<>(); + public record StoredMoveDecision(ShardRouting shardRouting, MoveDecision moveDecision) {} + + private final Map bestNonPreferredShardsByNode = new HashMap<>(); private final Map comparatorCache = new HashMap<>(); @Override @@ -1018,10 +1018,10 @@ public boolean test(ShardRouting shardRouting) { } public void putCurrentMoveDecision(ShardRouting shardRouting, MoveDecision moveDecision) { - bestNonPreferredShardsByNode.put(shardRouting.currentNodeId(), new MoveNotPreferredDecision(shardRouting, moveDecision)); + bestNonPreferredShardsByNode.put(shardRouting.currentNodeId(), new StoredMoveDecision(shardRouting, moveDecision)); } - public Iterable getPreferredShardMovements() { + public Iterable getPreferredShardMovements() { return bestNonPreferredShardsByNode.values(); } } From 2c87b39350df8e6a89ce1acf8323794b8fda9018 Mon Sep 17 00:00:00 2001 From: Nick Tindall Date: Mon, 29 Sep 2025 16:55:27 +1000 Subject: [PATCH 37/63] Minimise change after merge --- .../routing/allocation/allocator/BalancedShardsAllocator.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/server/src/main/java/org/elasticsearch/cluster/routing/allocation/allocator/BalancedShardsAllocator.java b/server/src/main/java/org/elasticsearch/cluster/routing/allocation/allocator/BalancedShardsAllocator.java index 24c822867e843..3fbd3a0b98a87 100644 --- a/server/src/main/java/org/elasticsearch/cluster/routing/allocation/allocator/BalancedShardsAllocator.java +++ b/server/src/main/java/org/elasticsearch/cluster/routing/allocation/allocator/BalancedShardsAllocator.java @@ -911,7 +911,7 @@ private MoveDecision decideMove(ProjectIndex index, ShardRouting shardRouting, P assert sourceNode != null && sourceNode.containsShard(index, shardRouting); RoutingNode routingNode = sourceNode.getRoutingNode(); Decision canRemain = allocation.deciders().canRemain(shardRouting, routingNode, allocation); - if (canRemain.type() != Decision.Type.NO && canRemain.type() != Type.NOT_PREFERRED) { + if (canRemain.type() != Decision.Type.NO && canRemain.type() != Decision.Type.NOT_PREFERRED) { return MoveDecision.remain(canRemain); } From 7ef836d2a5bcc71bceb76da51a745de3ce2b229c Mon Sep 17 00:00:00 2001 From: Nick Tindall Date: Tue, 30 Sep 2025 16:24:05 +1000 Subject: [PATCH 38/63] Fix HTML, reference relevant fields --- .../allocation/allocator/BalancedShardsAllocator.java | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/server/src/main/java/org/elasticsearch/cluster/routing/allocation/allocator/BalancedShardsAllocator.java b/server/src/main/java/org/elasticsearch/cluster/routing/allocation/allocator/BalancedShardsAllocator.java index 3fbd3a0b98a87..dea3b7cf04afa 100644 --- a/server/src/main/java/org/elasticsearch/cluster/routing/allocation/allocator/BalancedShardsAllocator.java +++ b/server/src/main/java/org/elasticsearch/cluster/routing/allocation/allocator/BalancedShardsAllocator.java @@ -1029,9 +1029,10 @@ public Iterable getPreferredShardMovements() { /** * Sorts shards by desirability to move, ranking goes (in descending priority order) *
    - *
  1. Shards with write-load in {threshold} -> maximum write-load (exclusive)
  2. - *
  3. Shards with write-load in {threshold} -> 0
  4. - *
  5. Shards with maximum write-load
  6. + *
  7. Shards with write-load in {@link ShardMovementPriorityComparator#threshold} → + * {@link ShardMovementPriorityComparator#maxWriteLoadOnNode} (exclusive)
  8. + *
  9. Shards with write-load in {@link ShardMovementPriorityComparator#threshold} → 0
  10. + *
  11. Shards with write-load == {@link ShardMovementPriorityComparator#maxWriteLoadOnNode}
  12. *
  13. Shards with missing write-load
  14. *
* From 4c6ebdb3e3a34c42ada6f70c59ba8aea47e1aa51 Mon Sep 17 00:00:00 2001 From: Nick Tindall Date: Tue, 30 Sep 2025 16:31:27 +1000 Subject: [PATCH 39/63] Document THRESHOLD_RATIO --- .../allocation/allocator/BalancedShardsAllocator.java | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/server/src/main/java/org/elasticsearch/cluster/routing/allocation/allocator/BalancedShardsAllocator.java b/server/src/main/java/org/elasticsearch/cluster/routing/allocation/allocator/BalancedShardsAllocator.java index dea3b7cf04afa..8acdb2d950ce4 100644 --- a/server/src/main/java/org/elasticsearch/cluster/routing/allocation/allocator/BalancedShardsAllocator.java +++ b/server/src/main/java/org/elasticsearch/cluster/routing/allocation/allocator/BalancedShardsAllocator.java @@ -1046,6 +1046,12 @@ public Iterable getPreferredShardMovements() { // Visible for testing static class ShardMovementPriorityComparator implements Comparator { + /** + * This is the threshold over which we consider shards to have a "high" write load represented + * as a ratio of the maximum write-load present on the node. + *

+ * We prefer to move shards that have a write-load close to this value x {@link #maxWriteLoadOnNode}. + */ private static final double THRESHOLD_RATIO = 0.5; private static final double MISSING_WRITE_LOAD = -1; private final Map shardWriteLoads; From a53787cd4f7c634ab3d09b2368ad3892fe245a6e Mon Sep 17 00:00:00 2001 From: Nick Tindall Date: Tue, 30 Sep 2025 16:43:26 +1000 Subject: [PATCH 40/63] More specific naming on comparator, fix javadoc paragraph break --- .../allocator/BalancedShardsAllocator.java | 20 +++++++++---------- .../BalancedShardsAllocatorTests.java | 2 +- 2 files changed, 11 insertions(+), 11 deletions(-) diff --git a/server/src/main/java/org/elasticsearch/cluster/routing/allocation/allocator/BalancedShardsAllocator.java b/server/src/main/java/org/elasticsearch/cluster/routing/allocation/allocator/BalancedShardsAllocator.java index 8acdb2d950ce4..65e05b3bf3dfa 100644 --- a/server/src/main/java/org/elasticsearch/cluster/routing/allocation/allocator/BalancedShardsAllocator.java +++ b/server/src/main/java/org/elasticsearch/cluster/routing/allocation/allocator/BalancedShardsAllocator.java @@ -994,14 +994,14 @@ private MoveDecision decideMove( /** * Stores the most desirable shard seen so far and compares proposed shards against it using - * the {@link ShardMovementPriorityComparator}. + * the {@link PrioritiseByShardWriteLoadComparator}. */ private class MostDesirableMovementsTracker implements Predicate { public record StoredMoveDecision(ShardRouting shardRouting, MoveDecision moveDecision) {} private final Map bestNonPreferredShardsByNode = new HashMap<>(); - private final Map comparatorCache = new HashMap<>(); + private final Map comparatorCache = new HashMap<>(); @Override public boolean test(ShardRouting shardRouting) { @@ -1011,7 +1011,7 @@ public boolean test(ShardRouting shardRouting) { } int comparison = comparatorCache.computeIfAbsent( shardRouting.currentNodeId(), - nodeId -> new ShardMovementPriorityComparator(allocation, allocation.routingNodes().node(nodeId)) + nodeId -> new PrioritiseByShardWriteLoadComparator(allocation, allocation.routingNodes().node(nodeId)) ).compare(shardRouting, currentShardForNode.shardRouting()); // Ignore inferior non-preferred moves return comparison > 0; @@ -1029,10 +1029,10 @@ public Iterable getPreferredShardMovements() { /** * Sorts shards by desirability to move, ranking goes (in descending priority order) *

    - *
  1. Shards with write-load in {@link ShardMovementPriorityComparator#threshold} → - * {@link ShardMovementPriorityComparator#maxWriteLoadOnNode} (exclusive)
  2. - *
  3. Shards with write-load in {@link ShardMovementPriorityComparator#threshold} → 0
  4. - *
  5. Shards with write-load == {@link ShardMovementPriorityComparator#maxWriteLoadOnNode}
  6. + *
  7. Shards with write-load in {@link PrioritiseByShardWriteLoadComparator#threshold} → + * {@link PrioritiseByShardWriteLoadComparator#maxWriteLoadOnNode} (exclusive)
  8. + *
  9. Shards with write-load in {@link PrioritiseByShardWriteLoadComparator#threshold} → 0
  10. + *
  11. Shards with write-load == {@link PrioritiseByShardWriteLoadComparator#maxWriteLoadOnNode}
  12. *
  13. Shards with missing write-load
  14. *
* @@ -1044,12 +1044,12 @@ public Iterable getPreferredShardMovements() { * */ // Visible for testing - static class ShardMovementPriorityComparator implements Comparator { + static class PrioritiseByShardWriteLoadComparator implements Comparator { /** * This is the threshold over which we consider shards to have a "high" write load represented * as a ratio of the maximum write-load present on the node. - *

+ *

* We prefer to move shards that have a write-load close to this value x {@link #maxWriteLoadOnNode}. */ private static final double THRESHOLD_RATIO = 0.5; @@ -1059,7 +1059,7 @@ static class ShardMovementPriorityComparator implements Comparator private final double threshold; private final String nodeId; - ShardMovementPriorityComparator(RoutingAllocation allocation, RoutingNode routingNode) { + PrioritiseByShardWriteLoadComparator(RoutingAllocation allocation, RoutingNode routingNode) { shardWriteLoads = allocation.clusterInfo().getShardWriteLoads(); double maxWriteLoadOnNode = MISSING_WRITE_LOAD; for (ShardRouting shardRouting : routingNode) { diff --git a/server/src/test/java/org/elasticsearch/cluster/routing/allocation/allocator/BalancedShardsAllocatorTests.java b/server/src/test/java/org/elasticsearch/cluster/routing/allocation/allocator/BalancedShardsAllocatorTests.java index 0a597c3bbe526..059b12d557af6 100644 --- a/server/src/test/java/org/elasticsearch/cluster/routing/allocation/allocator/BalancedShardsAllocatorTests.java +++ b/server/src/test/java/org/elasticsearch/cluster/routing/allocation/allocator/BalancedShardsAllocatorTests.java @@ -972,7 +972,7 @@ public void testShardMovementPriorityComparator() { allocatedRoutingNodes.initializeShard(shardRouting, nodeId, null, randomNonNegativeLong(), NOOP); } - final var comparator = new BalancedShardsAllocator.Balancer.ShardMovementPriorityComparator( + final var comparator = new BalancedShardsAllocator.Balancer.PrioritiseByShardWriteLoadComparator( allocation, allocatedRoutingNodes.node(nodeId) ); From 7d7d23cecd3da31f6f87d16501e5c925bdd522a7 Mon Sep 17 00:00:00 2001 From: Nick Tindall Date: Thu, 2 Oct 2025 09:32:16 +1000 Subject: [PATCH 41/63] Make comparator put most-preferred first --- .../allocator/BalancedShardsAllocator.java | 20 +++++------ .../BalancedShardsAllocatorTests.java | 35 ++++++++++--------- 2 files changed, 29 insertions(+), 26 deletions(-) diff --git a/server/src/main/java/org/elasticsearch/cluster/routing/allocation/allocator/BalancedShardsAllocator.java b/server/src/main/java/org/elasticsearch/cluster/routing/allocation/allocator/BalancedShardsAllocator.java index 65e05b3bf3dfa..988d4c35a5750 100644 --- a/server/src/main/java/org/elasticsearch/cluster/routing/allocation/allocator/BalancedShardsAllocator.java +++ b/server/src/main/java/org/elasticsearch/cluster/routing/allocation/allocator/BalancedShardsAllocator.java @@ -1014,7 +1014,7 @@ public boolean test(ShardRouting shardRouting) { nodeId -> new PrioritiseByShardWriteLoadComparator(allocation, allocation.routingNodes().node(nodeId)) ).compare(shardRouting, currentShardForNode.shardRouting()); // Ignore inferior non-preferred moves - return comparison > 0; + return comparison < 0; } public void putCurrentMoveDecision(ShardRouting shardRouting, MoveDecision moveDecision) { @@ -1027,7 +1027,7 @@ public Iterable getPreferredShardMovements() { } /** - * Sorts shards by desirability to move, ranking goes (in descending priority order) + * Sorts shards by desirability to move, sort order goes: *

    *
  1. Shards with write-load in {@link PrioritiseByShardWriteLoadComparator#threshold} → * {@link PrioritiseByShardWriteLoadComparator#maxWriteLoadOnNode} (exclusive)
  2. @@ -1038,9 +1038,9 @@ public Iterable getPreferredShardMovements() { * * e.g., for any two ShardRoutings, r1 and r2, *
      - *
    • compare(r1, r2) > 0 when r1 is most desirable to move
    • + *
    • compare(r1, r2) > 0 when r2 is more desirable to move
    • *
    • compare(r1, r2) == 0 when the two shards are equally desirable to move
    • - *
    • compare(r1, r2) < 0 when r2 is most desirable to move
    • + *
    • compare(r1, r2) < 0 when r1 is more desirable to move
    • *
    */ // Visible for testing @@ -1099,7 +1099,7 @@ public int compare(ShardRouting lhs, ShardRouting rhs) { return 0; } if (rhsIsMissing ^ lhsIsMissing) { - return lhsIsMissing ? -1 : 1; + return lhsIsMissing ? 1 : -1; } if (lhsWriteLoad < maxWriteLoadOnNode && rhsWriteLoad < maxWriteLoadOnNode) { @@ -1107,20 +1107,20 @@ public int compare(ShardRouting lhs, ShardRouting rhs) { final var rhsOverThreshold = rhsWriteLoad >= threshold; if (lhsOverThreshold && rhsOverThreshold) { // Both values between threshold and maximum, prefer lowest - return Double.compare(rhsWriteLoad, lhsWriteLoad); + return Double.compare(lhsWriteLoad, rhsWriteLoad); } else if (lhsOverThreshold) { // lhs between threshold and maximum, rhs below threshold, prefer lhs - return 1; + return -1; } else if (rhsOverThreshold) { // lhs below threshold, rhs between threshold and maximum, prefer rhs - return -1; + return 1; } // Both values below the threshold, prefer highest - return Double.compare(lhsWriteLoad, rhsWriteLoad); + return Double.compare(rhsWriteLoad, lhsWriteLoad); } // prefer the non-max write load if there is one - return Double.compare(rhsWriteLoad, lhsWriteLoad); + return Double.compare(lhsWriteLoad, rhsWriteLoad); } } diff --git a/server/src/test/java/org/elasticsearch/cluster/routing/allocation/allocator/BalancedShardsAllocatorTests.java b/server/src/test/java/org/elasticsearch/cluster/routing/allocation/allocator/BalancedShardsAllocatorTests.java index 059b12d557af6..7178a04bb0359 100644 --- a/server/src/test/java/org/elasticsearch/cluster/routing/allocation/allocator/BalancedShardsAllocatorTests.java +++ b/server/src/test/java/org/elasticsearch/cluster/routing/allocation/allocator/BalancedShardsAllocatorTests.java @@ -88,6 +88,8 @@ import static org.hamcrest.Matchers.lessThanOrEqualTo; import static org.hamcrest.Matchers.notNullValue; import static org.hamcrest.Matchers.sameInstance; +import static org.junit.Assert.assertFalse; +import static org.junit.Assert.assertTrue; public class BalancedShardsAllocatorTests extends ESAllocationTestCase { @@ -986,20 +988,9 @@ public void testShardMovementPriorityComparator() { double lastWriteLoad = 0.0; int currentIndex = 0; - logger.info("--> expecting {} missing", numMissing); - for (int i = 0; i < numMissing; i++) { - final var currentShardId = sortedShards.get(currentIndex++); - assertThat(shardWriteLoads, Matchers.not(Matchers.hasKey(currentShardId.shardId()))); - } - logger.info("--> expecting {} at max", numAtMax); - for (int i = 0; i < numAtMax; i++) { - final var currentShardId = sortedShards.get(currentIndex++).shardId(); - assertThat(shardWriteLoads, Matchers.hasKey(currentShardId)); - final double currentWriteLoad = shardWriteLoads.get(currentShardId); - assertThat(currentWriteLoad, equalTo(maxWriteLoad)); - } - logger.info("--> expecting {} below low in ascending order", numBelowLow); - for (int i = 0; i < numBelowLow; i++) { + + logger.info("--> expecting {} between low and max in ascending order", numBetweenLowAndMax); + for (int i = 0; i < numBetweenLowAndMax; i++) { final var currentShardId = sortedShards.get(currentIndex++).shardId(); assertThat(shardWriteLoads, Matchers.hasKey(currentShardId)); final double currentWriteLoad = shardWriteLoads.get(currentShardId); @@ -1009,8 +1000,8 @@ public void testShardMovementPriorityComparator() { assertThat(currentWriteLoad, greaterThanOrEqualTo(lastWriteLoad)); } } - logger.info("--> expecting {} between low and max in descending order", numBetweenLowAndMax); - for (int i = 0; i < numBetweenLowAndMax; i++) { + logger.info("--> expecting {} below low in descending order", numBelowLow); + for (int i = 0; i < numBelowLow; i++) { final var currentShardId = sortedShards.get(currentIndex++).shardId(); assertThat(shardWriteLoads, Matchers.hasKey(currentShardId)); final double currentWriteLoad = shardWriteLoads.get(currentShardId); @@ -1020,6 +1011,18 @@ public void testShardMovementPriorityComparator() { assertThat(currentWriteLoad, lessThanOrEqualTo(lastWriteLoad)); } } + logger.info("--> expecting {} at max", numAtMax); + for (int i = 0; i < numAtMax; i++) { + final var currentShardId = sortedShards.get(currentIndex++).shardId(); + assertThat(shardWriteLoads, Matchers.hasKey(currentShardId)); + final double currentWriteLoad = shardWriteLoads.get(currentShardId); + assertThat(currentWriteLoad, equalTo(maxWriteLoad)); + } + logger.info("--> expecting {} missing", numMissing); + for (int i = 0; i < numMissing; i++) { + final var currentShardId = sortedShards.get(currentIndex++); + assertThat(shardWriteLoads, Matchers.not(Matchers.hasKey(currentShardId.shardId()))); + } } private void addRandomWriteLoadAndRemoveShard( From 2199491775e0b7ab902f1a251091abb7f10bd024 Mon Sep 17 00:00:00 2001 From: Nick Tindall Date: Thu, 2 Oct 2025 09:37:22 +1000 Subject: [PATCH 42/63] Use LinkedHashMap for predictable iteration order --- .../routing/allocation/allocator/BalancedShardsAllocator.java | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/server/src/main/java/org/elasticsearch/cluster/routing/allocation/allocator/BalancedShardsAllocator.java b/server/src/main/java/org/elasticsearch/cluster/routing/allocation/allocator/BalancedShardsAllocator.java index 988d4c35a5750..a16e69c257850 100644 --- a/server/src/main/java/org/elasticsearch/cluster/routing/allocation/allocator/BalancedShardsAllocator.java +++ b/server/src/main/java/org/elasticsearch/cluster/routing/allocation/allocator/BalancedShardsAllocator.java @@ -50,6 +50,7 @@ import java.util.Comparator; import java.util.HashMap; import java.util.Iterator; +import java.util.LinkedHashMap; import java.util.List; import java.util.Map; import java.util.Set; @@ -1000,7 +1001,8 @@ private class MostDesirableMovementsTracker implements Predicate { public record StoredMoveDecision(ShardRouting shardRouting, MoveDecision moveDecision) {} - private final Map bestNonPreferredShardsByNode = new HashMap<>(); + // LinkedHashMap so we iterate in insertion order + private final Map bestNonPreferredShardsByNode = new LinkedHashMap<>(); private final Map comparatorCache = new HashMap<>(); @Override From ee95443ef84a2e90d569ee61fc5548e12e3caa65 Mon Sep 17 00:00:00 2001 From: Nick Tindall Date: Thu, 2 Oct 2025 09:41:49 +1000 Subject: [PATCH 43/63] Move tracker initialization above loop comment --- .../routing/allocation/allocator/BalancedShardsAllocator.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/server/src/main/java/org/elasticsearch/cluster/routing/allocation/allocator/BalancedShardsAllocator.java b/server/src/main/java/org/elasticsearch/cluster/routing/allocation/allocator/BalancedShardsAllocator.java index a16e69c257850..02f22e0eb1765 100644 --- a/server/src/main/java/org/elasticsearch/cluster/routing/allocation/allocator/BalancedShardsAllocator.java +++ b/server/src/main/java/org/elasticsearch/cluster/routing/allocation/allocator/BalancedShardsAllocator.java @@ -807,10 +807,10 @@ protected int comparePivot(int j) { */ public boolean moveShards() { boolean shardMoved = false; + final MostDesirableMovementsTracker movementsTracker = new MostDesirableMovementsTracker(); // Iterate over the started shards interleaving between nodes, and check if they can remain. In the presence of throttling // shard movements, the goal of this iteration order is to achieve a fairer movement of shards from the nodes that are // offloading the shards. - final MostDesirableMovementsTracker movementsTracker = new MostDesirableMovementsTracker(); for (Iterator it = allocation.routingNodes().nodeInterleavedShardIterator(); it.hasNext();) { final ShardRouting shardRouting = it.next(); final ProjectIndex index = projectIndex(shardRouting); From 8673351318de02689d3d332aa32b9204c853d56a Mon Sep 17 00:00:00 2001 From: Nick Tindall Date: Thu, 2 Oct 2025 11:27:09 +1000 Subject: [PATCH 44/63] Make it clearer what the predicate is when looking for the best non-preferred shard to move --- .../allocator/BalancedShardsAllocator.java | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/server/src/main/java/org/elasticsearch/cluster/routing/allocation/allocator/BalancedShardsAllocator.java b/server/src/main/java/org/elasticsearch/cluster/routing/allocation/allocator/BalancedShardsAllocator.java index 02f22e0eb1765..8dcdc03cdae57 100644 --- a/server/src/main/java/org/elasticsearch/cluster/routing/allocation/allocator/BalancedShardsAllocator.java +++ b/server/src/main/java/org/elasticsearch/cluster/routing/allocation/allocator/BalancedShardsAllocator.java @@ -814,7 +814,7 @@ public boolean moveShards() { for (Iterator it = allocation.routingNodes().nodeInterleavedShardIterator(); it.hasNext();) { final ShardRouting shardRouting = it.next(); final ProjectIndex index = projectIndex(shardRouting); - final MoveDecision moveDecision = decideMove(index, shardRouting, movementsTracker); + final MoveDecision moveDecision = decideMove(index, shardRouting, movementsTracker::shardIsPreferableToCurrent); if (moveDecision.isDecisionTaken() && moveDecision.forceMove()) { // Defer moving of not-preferred until we've moved the NOs if (moveDecision.getCanRemainDecision().type() == Type.NOT_PREFERRED) { @@ -997,7 +997,7 @@ private MoveDecision decideMove( * Stores the most desirable shard seen so far and compares proposed shards against it using * the {@link PrioritiseByShardWriteLoadComparator}. */ - private class MostDesirableMovementsTracker implements Predicate { + private class MostDesirableMovementsTracker { public record StoredMoveDecision(ShardRouting shardRouting, MoveDecision moveDecision) {} @@ -1005,8 +1005,14 @@ public record StoredMoveDecision(ShardRouting shardRouting, MoveDecision moveDec private final Map bestNonPreferredShardsByNode = new LinkedHashMap<>(); private final Map comparatorCache = new HashMap<>(); - @Override - public boolean test(ShardRouting shardRouting) { + /** + * Is the provided {@link ShardRouting} potentially a better shard to move than the one + * we currently have stored for this node? + * + * @param shardRouting The shard routing being considered for movement + * @return true if the shard is more desirable to move that the current one we stored for this node, false otherwise. + */ + public boolean shardIsPreferableToCurrent(ShardRouting shardRouting) { final var currentShardForNode = bestNonPreferredShardsByNode.get(shardRouting.currentNodeId()); if (currentShardForNode == null) { return true; From 1b87daa2331410ddbe52db0eb2180081e6205b31 Mon Sep 17 00:00:00 2001 From: Nick Tindall Date: Thu, 2 Oct 2025 12:21:07 +1000 Subject: [PATCH 45/63] Update server/src/main/java/org/elasticsearch/cluster/routing/allocation/allocator/BalancedShardsAllocator.java Co-authored-by: Dianna Hohensee --- .../routing/allocation/allocator/BalancedShardsAllocator.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/server/src/main/java/org/elasticsearch/cluster/routing/allocation/allocator/BalancedShardsAllocator.java b/server/src/main/java/org/elasticsearch/cluster/routing/allocation/allocator/BalancedShardsAllocator.java index 8dcdc03cdae57..01c7234c8357b 100644 --- a/server/src/main/java/org/elasticsearch/cluster/routing/allocation/allocator/BalancedShardsAllocator.java +++ b/server/src/main/java/org/elasticsearch/cluster/routing/allocation/allocator/BalancedShardsAllocator.java @@ -841,7 +841,7 @@ public boolean moveShards() { final var moveDecision = shardMoved ? decideMove(index, shardRouting) : preferredMove.moveDecision(); if (moveDecision.isDecisionTaken() && moveDecision.forceMove()) { executeMove(shardRouting, index, moveDecision, "move-non-preferred"); - // We only ever move a single non-preferred shard at a time + // Return after a single move so that the change can be simulated before further moves are made. return true; } else { logger.trace("[{}][{}] can no longer move (not-preferred)", shardRouting.index(), shardRouting.id()); From 57979dc5f2469a4eee06602cb7999da8c4b522d2 Mon Sep 17 00:00:00 2001 From: Nick Tindall Date: Thu, 2 Oct 2025 12:31:16 +1000 Subject: [PATCH 46/63] Improve naming --- .../allocator/BalancedShardsAllocator.java | 32 +++++++++++-------- 1 file changed, 18 insertions(+), 14 deletions(-) diff --git a/server/src/main/java/org/elasticsearch/cluster/routing/allocation/allocator/BalancedShardsAllocator.java b/server/src/main/java/org/elasticsearch/cluster/routing/allocation/allocator/BalancedShardsAllocator.java index 01c7234c8357b..6c00a5c8c202a 100644 --- a/server/src/main/java/org/elasticsearch/cluster/routing/allocation/allocator/BalancedShardsAllocator.java +++ b/server/src/main/java/org/elasticsearch/cluster/routing/allocation/allocator/BalancedShardsAllocator.java @@ -807,18 +807,22 @@ protected int comparePivot(int j) { */ public boolean moveShards() { boolean shardMoved = false; - final MostDesirableMovementsTracker movementsTracker = new MostDesirableMovementsTracker(); + final BestShardMovementsTracker bestNonPreferredShardMovementsTracker = new BestShardMovementsTracker(); // Iterate over the started shards interleaving between nodes, and check if they can remain. In the presence of throttling // shard movements, the goal of this iteration order is to achieve a fairer movement of shards from the nodes that are // offloading the shards. for (Iterator it = allocation.routingNodes().nodeInterleavedShardIterator(); it.hasNext();) { final ShardRouting shardRouting = it.next(); final ProjectIndex index = projectIndex(shardRouting); - final MoveDecision moveDecision = decideMove(index, shardRouting, movementsTracker::shardIsPreferableToCurrent); + final MoveDecision moveDecision = decideMove( + index, + shardRouting, + bestNonPreferredShardMovementsTracker::shardIsBetterThanCurrent + ); if (moveDecision.isDecisionTaken() && moveDecision.forceMove()) { // Defer moving of not-preferred until we've moved the NOs if (moveDecision.getCanRemainDecision().type() == Type.NOT_PREFERRED) { - movementsTracker.putCurrentMoveDecision(shardRouting, moveDecision); + bestNonPreferredShardMovementsTracker.putBestMoveDecision(shardRouting, moveDecision); } else { executeMove(shardRouting, index, moveDecision, "move"); if (completeEarlyOnShardAssignmentChange) { @@ -832,13 +836,13 @@ public boolean moveShards() { } // If we get here, attempt to move one of the best not-preferred shards that we identified earlier - for (var preferredMove : movementsTracker.getPreferredShardMovements()) { - final var shardRouting = preferredMove.shardRouting(); + for (var potentialMove : bestNonPreferredShardMovementsTracker.getBestShardMovements()) { + final var shardRouting = potentialMove.shardRouting(); final var index = projectIndex(shardRouting); // If `shardMoved` is true, there may have been moves that have made our previous move decision // invalid, so we must call `decideMove` again. If not, we know we haven't made any moves, and we // can use the cached decision. - final var moveDecision = shardMoved ? decideMove(index, shardRouting) : preferredMove.moveDecision(); + final var moveDecision = shardMoved ? decideMove(index, shardRouting) : potentialMove.moveDecision(); if (moveDecision.isDecisionTaken() && moveDecision.forceMove()) { executeMove(shardRouting, index, moveDecision, "move-non-preferred"); // Return after a single move so that the change can be simulated before further moves are made. @@ -997,12 +1001,12 @@ private MoveDecision decideMove( * Stores the most desirable shard seen so far and compares proposed shards against it using * the {@link PrioritiseByShardWriteLoadComparator}. */ - private class MostDesirableMovementsTracker { + private class BestShardMovementsTracker { public record StoredMoveDecision(ShardRouting shardRouting, MoveDecision moveDecision) {} // LinkedHashMap so we iterate in insertion order - private final Map bestNonPreferredShardsByNode = new LinkedHashMap<>(); + private final Map bestShardMovementsByNode = new LinkedHashMap<>(); private final Map comparatorCache = new HashMap<>(); /** @@ -1012,8 +1016,8 @@ public record StoredMoveDecision(ShardRouting shardRouting, MoveDecision moveDec * @param shardRouting The shard routing being considered for movement * @return true if the shard is more desirable to move that the current one we stored for this node, false otherwise. */ - public boolean shardIsPreferableToCurrent(ShardRouting shardRouting) { - final var currentShardForNode = bestNonPreferredShardsByNode.get(shardRouting.currentNodeId()); + public boolean shardIsBetterThanCurrent(ShardRouting shardRouting) { + final var currentShardForNode = bestShardMovementsByNode.get(shardRouting.currentNodeId()); if (currentShardForNode == null) { return true; } @@ -1025,12 +1029,12 @@ public boolean shardIsPreferableToCurrent(ShardRouting shardRouting) { return comparison < 0; } - public void putCurrentMoveDecision(ShardRouting shardRouting, MoveDecision moveDecision) { - bestNonPreferredShardsByNode.put(shardRouting.currentNodeId(), new StoredMoveDecision(shardRouting, moveDecision)); + public void putBestMoveDecision(ShardRouting shardRouting, MoveDecision moveDecision) { + bestShardMovementsByNode.put(shardRouting.currentNodeId(), new StoredMoveDecision(shardRouting, moveDecision)); } - public Iterable getPreferredShardMovements() { - return bestNonPreferredShardsByNode.values(); + public Iterable getBestShardMovements() { + return bestShardMovementsByNode.values(); } } From 3fef076a184958a678bb1ca16aae8ac020bbadc5 Mon Sep 17 00:00:00 2001 From: Nick Tindall Date: Thu, 2 Oct 2025 12:42:03 +1000 Subject: [PATCH 47/63] Improve naming (again) --- .../allocator/BalancedShardsAllocator.java | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/server/src/main/java/org/elasticsearch/cluster/routing/allocation/allocator/BalancedShardsAllocator.java b/server/src/main/java/org/elasticsearch/cluster/routing/allocation/allocator/BalancedShardsAllocator.java index 6c00a5c8c202a..319e92f21d94a 100644 --- a/server/src/main/java/org/elasticsearch/cluster/routing/allocation/allocator/BalancedShardsAllocator.java +++ b/server/src/main/java/org/elasticsearch/cluster/routing/allocation/allocator/BalancedShardsAllocator.java @@ -836,13 +836,13 @@ public boolean moveShards() { } // If we get here, attempt to move one of the best not-preferred shards that we identified earlier - for (var potentialMove : bestNonPreferredShardMovementsTracker.getBestShardMovements()) { - final var shardRouting = potentialMove.shardRouting(); + for (var storedShardMovement : bestNonPreferredShardMovementsTracker.getBestShardMovements()) { + final var shardRouting = storedShardMovement.shardRouting(); final var index = projectIndex(shardRouting); // If `shardMoved` is true, there may have been moves that have made our previous move decision // invalid, so we must call `decideMove` again. If not, we know we haven't made any moves, and we // can use the cached decision. - final var moveDecision = shardMoved ? decideMove(index, shardRouting) : potentialMove.moveDecision(); + final var moveDecision = shardMoved ? decideMove(index, shardRouting) : storedShardMovement.moveDecision(); if (moveDecision.isDecisionTaken() && moveDecision.forceMove()) { executeMove(shardRouting, index, moveDecision, "move-non-preferred"); // Return after a single move so that the change can be simulated before further moves are made. @@ -1003,10 +1003,10 @@ private MoveDecision decideMove( */ private class BestShardMovementsTracker { - public record StoredMoveDecision(ShardRouting shardRouting, MoveDecision moveDecision) {} + public record StoredShardMovement(ShardRouting shardRouting, MoveDecision moveDecision) {} // LinkedHashMap so we iterate in insertion order - private final Map bestShardMovementsByNode = new LinkedHashMap<>(); + private final Map bestShardMovementsByNode = new LinkedHashMap<>(); private final Map comparatorCache = new HashMap<>(); /** @@ -1030,10 +1030,10 @@ public boolean shardIsBetterThanCurrent(ShardRouting shardRouting) { } public void putBestMoveDecision(ShardRouting shardRouting, MoveDecision moveDecision) { - bestShardMovementsByNode.put(shardRouting.currentNodeId(), new StoredMoveDecision(shardRouting, moveDecision)); + bestShardMovementsByNode.put(shardRouting.currentNodeId(), new StoredShardMovement(shardRouting, moveDecision)); } - public Iterable getBestShardMovements() { + public Iterable getBestShardMovements() { return bestShardMovementsByNode.values(); } } From 2ea2ea371a94b0f562b73be4e00a840ae79511ea Mon Sep 17 00:00:00 2001 From: Nick Tindall Date: Thu, 2 Oct 2025 13:04:02 +1000 Subject: [PATCH 48/63] Remove decideCanAllocatePreferredOnly --- .../allocator/BalancedShardsAllocator.java | 14 +------------- 1 file changed, 1 insertion(+), 13 deletions(-) diff --git a/server/src/main/java/org/elasticsearch/cluster/routing/allocation/allocator/BalancedShardsAllocator.java b/server/src/main/java/org/elasticsearch/cluster/routing/allocation/allocator/BalancedShardsAllocator.java index 319e92f21d94a..1e8587a138eb0 100644 --- a/server/src/main/java/org/elasticsearch/cluster/routing/allocation/allocator/BalancedShardsAllocator.java +++ b/server/src/main/java/org/elasticsearch/cluster/routing/allocation/allocator/BalancedShardsAllocator.java @@ -932,10 +932,7 @@ private MoveDecision decideMove(ProjectIndex index, ShardRouting shardRouting, P * This is not guaranteed to be balanced after this operation we still try best effort to * allocate on the minimal eligible node. */ - final BiFunction decider = canRemain.type() == Type.NOT_PREFERRED - ? this::decideCanAllocatePreferredOnly - : this::decideCanAllocate; - final MoveDecision moveDecision = decideMove(sorter, shardRouting, sourceNode, canRemain, decider); + final MoveDecision moveDecision = decideMove(sorter, shardRouting, sourceNode, canRemain, this::decideCanAllocate); if (moveDecision.getCanRemainDecision().type() == Type.NO && moveDecision.canRemain() == false && moveDecision.forceMove() == false) { @@ -1136,15 +1133,6 @@ public int compare(ShardRouting lhs, ShardRouting rhs) { } } - private Decision decideCanAllocatePreferredOnly(ShardRouting shardRouting, RoutingNode target) { - final Decision decision = allocation.deciders().canAllocate(shardRouting, target, allocation); - // not-preferred means no here - if (decision.type() == Type.NOT_PREFERRED) { - return Decision.NO; - } - return decision; - } - private Decision decideCanAllocate(ShardRouting shardRouting, RoutingNode target) { // don't use canRebalance as we want hard filtering rules to apply. See #17698 return allocation.deciders().canAllocate(shardRouting, target, allocation); From 105d7bff4648a6133484e14dbf3e233374484a5b Mon Sep 17 00:00:00 2001 From: Nick Tindall Date: Thu, 2 Oct 2025 14:39:33 +1000 Subject: [PATCH 49/63] Remove redundant check --- .../routing/allocation/allocator/BalancedShardsAllocator.java | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/server/src/main/java/org/elasticsearch/cluster/routing/allocation/allocator/BalancedShardsAllocator.java b/server/src/main/java/org/elasticsearch/cluster/routing/allocation/allocator/BalancedShardsAllocator.java index 1e8587a138eb0..a10d8c8784c0f 100644 --- a/server/src/main/java/org/elasticsearch/cluster/routing/allocation/allocator/BalancedShardsAllocator.java +++ b/server/src/main/java/org/elasticsearch/cluster/routing/allocation/allocator/BalancedShardsAllocator.java @@ -933,9 +933,7 @@ private MoveDecision decideMove(ProjectIndex index, ShardRouting shardRouting, P * allocate on the minimal eligible node. */ final MoveDecision moveDecision = decideMove(sorter, shardRouting, sourceNode, canRemain, this::decideCanAllocate); - if (moveDecision.getCanRemainDecision().type() == Type.NO - && moveDecision.canRemain() == false - && moveDecision.forceMove() == false) { + if (moveDecision.canRemain() == false && moveDecision.forceMove() == false) { final boolean shardsOnReplacedNode = allocation.metadata().nodeShutdowns().contains(shardRouting.currentNodeId(), REPLACE); if (shardsOnReplacedNode) { return decideMove(sorter, shardRouting, sourceNode, canRemain, this::decideCanForceAllocateForVacate); From 78372116baea0049463a7ccefd046ae7d71aed8b Mon Sep 17 00:00:00 2001 From: Nick Tindall Date: Thu, 2 Oct 2025 15:16:32 +1000 Subject: [PATCH 50/63] Improve javadoc --- .../allocation/allocator/BalancedShardsAllocator.java | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/server/src/main/java/org/elasticsearch/cluster/routing/allocation/allocator/BalancedShardsAllocator.java b/server/src/main/java/org/elasticsearch/cluster/routing/allocation/allocator/BalancedShardsAllocator.java index a10d8c8784c0f..31b201e85f355 100644 --- a/server/src/main/java/org/elasticsearch/cluster/routing/allocation/allocator/BalancedShardsAllocator.java +++ b/server/src/main/java/org/elasticsearch/cluster/routing/allocation/allocator/BalancedShardsAllocator.java @@ -874,9 +874,11 @@ private void executeMove(ShardRouting shardRouting, ProjectIndex index, MoveDeci } /** - * Decide whether to move a started shard to another node. - * Unconditional version of {@link #decideMove(ProjectIndex, ShardRouting, Predicate)} + * Makes a decision on whether to move a started shard to another node. + *

    + * This overload will always assess move options for non-preferred allocations. * + * @see #decideMove(ProjectIndex, ShardRouting, Predicate) * @param index The index that the shard being considered belongs to * @param shardRouting The shard routing being considered for movement * @return The {@link MoveDecision} for the shard From cc147c92530aa701c2908cb0882d6ade068328db Mon Sep 17 00:00:00 2001 From: Nick Tindall Date: Mon, 6 Oct 2025 10:46:26 +1100 Subject: [PATCH 51/63] Improve javadoc --- .../allocation/allocator/BalancedShardsAllocator.java | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/server/src/main/java/org/elasticsearch/cluster/routing/allocation/allocator/BalancedShardsAllocator.java b/server/src/main/java/org/elasticsearch/cluster/routing/allocation/allocator/BalancedShardsAllocator.java index 31b201e85f355..eb8742155c3d2 100644 --- a/server/src/main/java/org/elasticsearch/cluster/routing/allocation/allocator/BalancedShardsAllocator.java +++ b/server/src/main/java/org/elasticsearch/cluster/routing/allocation/allocator/BalancedShardsAllocator.java @@ -876,7 +876,8 @@ private void executeMove(ShardRouting shardRouting, ProjectIndex index, MoveDeci /** * Makes a decision on whether to move a started shard to another node. *

    - * This overload will always assess move options for non-preferred allocations. + * This overload will always search for relocation targets for {@link Decision#NOT_PREFERRED} + * allocations. * * @see #decideMove(ProjectIndex, ShardRouting, Predicate) * @param index The index that the shard being considered belongs to @@ -902,7 +903,11 @@ public MoveDecision decideMove(final ProjectIndex index, final ShardRouting shar * * @param index The index that the shard being considered belongs to * @param shardRouting The shard routing being considered for movement - * @param nonPreferredPredicate A predicate used to determine whether to assess move options for shards in non-preferred allocations + * @param nonPreferredPredicate A predicate applied to every assignment for which + * {@link AllocationDeciders#canRemain(ShardRouting, RoutingNode, RoutingAllocation)} returns + * {@link Type#NOT_PREFERRED}. If the predicate returns true, a search for relocation targets will be + * performed, if it returns false no search will be performed and {@link MoveDecision#NOT_TAKEN} will + * be returned. * @return The {@link MoveDecision} for the shard */ private MoveDecision decideMove(ProjectIndex index, ShardRouting shardRouting, Predicate nonPreferredPredicate) { From 35dd5d4b43929d8532c4a674867a631bf669fae3 Mon Sep 17 00:00:00 2001 From: Nick Tindall Date: Mon, 6 Oct 2025 10:52:29 +1100 Subject: [PATCH 52/63] Improve javadoc (again) --- .../routing/allocation/allocator/BalancedShardsAllocator.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/server/src/main/java/org/elasticsearch/cluster/routing/allocation/allocator/BalancedShardsAllocator.java b/server/src/main/java/org/elasticsearch/cluster/routing/allocation/allocator/BalancedShardsAllocator.java index eb8742155c3d2..9e0d98d7273c0 100644 --- a/server/src/main/java/org/elasticsearch/cluster/routing/allocation/allocator/BalancedShardsAllocator.java +++ b/server/src/main/java/org/elasticsearch/cluster/routing/allocation/allocator/BalancedShardsAllocator.java @@ -903,7 +903,7 @@ public MoveDecision decideMove(final ProjectIndex index, final ShardRouting shar * * @param index The index that the shard being considered belongs to * @param shardRouting The shard routing being considered for movement - * @param nonPreferredPredicate A predicate applied to every assignment for which + * @param nonPreferredPredicate A predicate applied to assignments for which * {@link AllocationDeciders#canRemain(ShardRouting, RoutingNode, RoutingAllocation)} returns * {@link Type#NOT_PREFERRED}. If the predicate returns true, a search for relocation targets will be * performed, if it returns false no search will be performed and {@link MoveDecision#NOT_TAKEN} will From c32d6e93f0cfd9bfb4bd72aee39a43d26898bca1 Mon Sep 17 00:00:00 2001 From: Nick Tindall Date: Mon, 6 Oct 2025 11:00:41 +1100 Subject: [PATCH 53/63] Improve javadoc --- .../allocation/allocator/BalancedShardsAllocator.java | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/server/src/main/java/org/elasticsearch/cluster/routing/allocation/allocator/BalancedShardsAllocator.java b/server/src/main/java/org/elasticsearch/cluster/routing/allocation/allocator/BalancedShardsAllocator.java index 9e0d98d7273c0..a7fc04afc5ee8 100644 --- a/server/src/main/java/org/elasticsearch/cluster/routing/allocation/allocator/BalancedShardsAllocator.java +++ b/server/src/main/java/org/elasticsearch/cluster/routing/allocation/allocator/BalancedShardsAllocator.java @@ -1000,8 +1000,9 @@ private MoveDecision decideMove( } /** - * Stores the most desirable shard seen so far and compares proposed shards against it using - * the {@link PrioritiseByShardWriteLoadComparator}. + * Keeps track of the single "best" shard movement we could make from each node, as scored by + * the {@link PrioritiseByShardWriteLoadComparator}. Provides a utility for checking if + * a proposed movement is "better" than the current best for that node. */ private class BestShardMovementsTracker { From 291416af46f5be82f54337a63e2ce91cc12c6264 Mon Sep 17 00:00:00 2001 From: Nick Tindall Date: Mon, 6 Oct 2025 11:10:11 +1100 Subject: [PATCH 54/63] Use constant for threshold --- .../allocator/BalancedShardsAllocator.java | 2 +- .../BalancedShardsAllocatorTests.java | 30 +++++++++++-------- 2 files changed, 19 insertions(+), 13 deletions(-) diff --git a/server/src/main/java/org/elasticsearch/cluster/routing/allocation/allocator/BalancedShardsAllocator.java b/server/src/main/java/org/elasticsearch/cluster/routing/allocation/allocator/BalancedShardsAllocator.java index a7fc04afc5ee8..ca0710bedb2a7 100644 --- a/server/src/main/java/org/elasticsearch/cluster/routing/allocation/allocator/BalancedShardsAllocator.java +++ b/server/src/main/java/org/elasticsearch/cluster/routing/allocation/allocator/BalancedShardsAllocator.java @@ -1067,7 +1067,7 @@ static class PrioritiseByShardWriteLoadComparator implements Comparator * We prefer to move shards that have a write-load close to this value x {@link #maxWriteLoadOnNode}. */ - private static final double THRESHOLD_RATIO = 0.5; + public static final double THRESHOLD_RATIO = 0.5; private static final double MISSING_WRITE_LOAD = -1; private final Map shardWriteLoads; private final double maxWriteLoadOnNode; diff --git a/server/src/test/java/org/elasticsearch/cluster/routing/allocation/allocator/BalancedShardsAllocatorTests.java b/server/src/test/java/org/elasticsearch/cluster/routing/allocation/allocator/BalancedShardsAllocatorTests.java index 7178a04bb0359..e3f0e6a38628a 100644 --- a/server/src/test/java/org/elasticsearch/cluster/routing/allocation/allocator/BalancedShardsAllocatorTests.java +++ b/server/src/test/java/org/elasticsearch/cluster/routing/allocation/allocator/BalancedShardsAllocatorTests.java @@ -75,6 +75,7 @@ import static java.util.stream.Collectors.toSet; import static org.elasticsearch.cluster.routing.ShardRoutingState.RELOCATING; import static org.elasticsearch.cluster.routing.TestShardRouting.shardRoutingBuilder; +import static org.elasticsearch.cluster.routing.allocation.allocator.BalancedShardsAllocator.Balancer.PrioritiseByShardWriteLoadComparator.THRESHOLD_RATIO; import static org.elasticsearch.cluster.routing.allocation.allocator.BalancedShardsAllocator.DISK_USAGE_BALANCE_FACTOR_SETTING; import static org.elasticsearch.cluster.routing.allocation.allocator.WeightFunction.getIndexDiskUsageInBytes; import static org.elasticsearch.cluster.routing.allocation.decider.DiskThresholdDecider.SETTING_IGNORE_DISK_WATERMARKS; @@ -929,12 +930,12 @@ public void testReturnEarlyOnShardAssignmentChanges() { public void testShardMovementPriorityComparator() { final double maxWriteLoad = randomDoubleBetween(0.0, 100.0, true); - final double lowThreshold = maxWriteLoad * 0.5; + final double writeLoadThreshold = maxWriteLoad * THRESHOLD_RATIO; final int numAtMax = between(1, 5); - final int numBetweenLowAndMax = between(0, 50); - final int numBelowLow = between(0, 50); + final int numBetweenThresholdAndMax = between(0, 50); + final int numBelowThreshold = between(0, 50); final int numMissing = between(0, 50); - final int totalShards = numAtMax + numBetweenLowAndMax + numBelowLow + numMissing; + final int totalShards = numAtMax + numBetweenThresholdAndMax + numBelowThreshold + numMissing; final var indices = new ArrayList(); for (int i = 0; i < totalShards; i++) { @@ -954,10 +955,15 @@ public void testShardMovementPriorityComparator() { addRandomWriteLoadAndRemoveShard( shardWriteLoads, allShards, - numBetweenLowAndMax, - () -> randomDoubleBetween(lowThreshold, maxWriteLoad, true) + numBetweenThresholdAndMax, + () -> randomDoubleBetween(writeLoadThreshold, maxWriteLoad, true) + ); + addRandomWriteLoadAndRemoveShard( + shardWriteLoads, + allShards, + numBelowThreshold, + () -> randomDoubleBetween(0, writeLoadThreshold, true) ); - addRandomWriteLoadAndRemoveShard(shardWriteLoads, allShards, numBelowLow, () -> randomDoubleBetween(0, lowThreshold, true)); assertThat(allShards, hasSize(numMissing)); final var allocation = new RoutingAllocation( @@ -979,7 +985,7 @@ public void testShardMovementPriorityComparator() { allocatedRoutingNodes.node(nodeId) ); - logger.info("--> testing shard movement priority comparator, maxValue={}, threshold={}", maxWriteLoad, lowThreshold); + logger.info("--> testing shard movement priority comparator, maxValue={}, threshold={}", maxWriteLoad, writeLoadThreshold); var sortedShards = allocatedRoutingNodes.getAssignedShards().values().stream().flatMap(List::stream).sorted(comparator).toList(); for (ShardRouting shardRouting : sortedShards) { @@ -989,8 +995,8 @@ public void testShardMovementPriorityComparator() { double lastWriteLoad = 0.0; int currentIndex = 0; - logger.info("--> expecting {} between low and max in ascending order", numBetweenLowAndMax); - for (int i = 0; i < numBetweenLowAndMax; i++) { + logger.info("--> expecting {} between threshold and max in ascending order", numBetweenThresholdAndMax); + for (int i = 0; i < numBetweenThresholdAndMax; i++) { final var currentShardId = sortedShards.get(currentIndex++).shardId(); assertThat(shardWriteLoads, Matchers.hasKey(currentShardId)); final double currentWriteLoad = shardWriteLoads.get(currentShardId); @@ -1000,8 +1006,8 @@ public void testShardMovementPriorityComparator() { assertThat(currentWriteLoad, greaterThanOrEqualTo(lastWriteLoad)); } } - logger.info("--> expecting {} below low in descending order", numBelowLow); - for (int i = 0; i < numBelowLow; i++) { + logger.info("--> expecting {} below threshold in descending order", numBelowThreshold); + for (int i = 0; i < numBelowThreshold; i++) { final var currentShardId = sortedShards.get(currentIndex++).shardId(); assertThat(shardWriteLoads, Matchers.hasKey(currentShardId)); final double currentWriteLoad = shardWriteLoads.get(currentShardId); From 0bfcf6b7d6d9029d434533d70e4b515ee5b0ac1f Mon Sep 17 00:00:00 2001 From: Nick Tindall Date: Mon, 6 Oct 2025 11:14:24 +1100 Subject: [PATCH 55/63] Document test utility --- .../allocator/BalancedShardsAllocatorTests.java | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/server/src/test/java/org/elasticsearch/cluster/routing/allocation/allocator/BalancedShardsAllocatorTests.java b/server/src/test/java/org/elasticsearch/cluster/routing/allocation/allocator/BalancedShardsAllocatorTests.java index e3f0e6a38628a..a665fc10c7da8 100644 --- a/server/src/test/java/org/elasticsearch/cluster/routing/allocation/allocator/BalancedShardsAllocatorTests.java +++ b/server/src/test/java/org/elasticsearch/cluster/routing/allocation/allocator/BalancedShardsAllocatorTests.java @@ -1031,6 +1031,14 @@ public void testShardMovementPriorityComparator() { } } + /** + * Randomly select a shard and add a random write-load for it + * + * @param shardWriteLoads The map of shards to write-loads, this will be added to + * @param shards The set of shards to select from, selected shards will be removed from this set + * @param count The number of shards to generate write loads for + * @param writeLoadSupplier The supplier of random write loads to use + */ private void addRandomWriteLoadAndRemoveShard( Map shardWriteLoads, Set shards, From 76681011c45c4995961d9521aeb352b675425df1 Mon Sep 17 00:00:00 2001 From: Nick Tindall Date: Mon, 6 Oct 2025 11:26:10 +1100 Subject: [PATCH 56/63] Explain single shard indices --- .../allocation/allocator/BalancedShardsAllocatorTests.java | 1 + 1 file changed, 1 insertion(+) diff --git a/server/src/test/java/org/elasticsearch/cluster/routing/allocation/allocator/BalancedShardsAllocatorTests.java b/server/src/test/java/org/elasticsearch/cluster/routing/allocation/allocator/BalancedShardsAllocatorTests.java index a665fc10c7da8..5366fc16cc608 100644 --- a/server/src/test/java/org/elasticsearch/cluster/routing/allocation/allocator/BalancedShardsAllocatorTests.java +++ b/server/src/test/java/org/elasticsearch/cluster/routing/allocation/allocator/BalancedShardsAllocatorTests.java @@ -937,6 +937,7 @@ public void testShardMovementPriorityComparator() { final int numMissing = between(0, 50); final int totalShards = numAtMax + numBetweenThresholdAndMax + numBelowThreshold + numMissing; + // We create single-shard indices for simplicity's sake and to make it clear the shards are independent of each other final var indices = new ArrayList(); for (int i = 0; i < totalShards; i++) { indices.add(anIndex("index-" + i).numberOfShards(1).numberOfReplicas(0)); From f3588e281d2ad42b799f76bddfa0258ca9971763 Mon Sep 17 00:00:00 2001 From: Nick Tindall Date: Mon, 6 Oct 2025 11:28:55 +1100 Subject: [PATCH 57/63] Improve naming --- .../BalancedShardsAllocatorTests.java | 35 ++++++++++--------- 1 file changed, 18 insertions(+), 17 deletions(-) diff --git a/server/src/test/java/org/elasticsearch/cluster/routing/allocation/allocator/BalancedShardsAllocatorTests.java b/server/src/test/java/org/elasticsearch/cluster/routing/allocation/allocator/BalancedShardsAllocatorTests.java index 5366fc16cc608..7afc8483dcf06 100644 --- a/server/src/test/java/org/elasticsearch/cluster/routing/allocation/allocator/BalancedShardsAllocatorTests.java +++ b/server/src/test/java/org/elasticsearch/cluster/routing/allocation/allocator/BalancedShardsAllocatorTests.java @@ -931,11 +931,12 @@ public void testReturnEarlyOnShardAssignmentChanges() { public void testShardMovementPriorityComparator() { final double maxWriteLoad = randomDoubleBetween(0.0, 100.0, true); final double writeLoadThreshold = maxWriteLoad * THRESHOLD_RATIO; - final int numAtMax = between(1, 5); - final int numBetweenThresholdAndMax = between(0, 50); - final int numBelowThreshold = between(0, 50); - final int numMissing = between(0, 50); - final int totalShards = numAtMax + numBetweenThresholdAndMax + numBelowThreshold + numMissing; + final int numberOfShardsWithMaxWriteLoad = between(1, 5); + final int numberOfShardsWithWriteLoadBetweenThresholdAndMax = between(0, 50); + final int numberOfShardsWithWriteLoadBelowThreshold = between(0, 50); + final int numberOfShardsWithNoWriteLoad = between(0, 50); + final int totalShards = numberOfShardsWithMaxWriteLoad + numberOfShardsWithWriteLoadBetweenThresholdAndMax + + numberOfShardsWithWriteLoadBelowThreshold + numberOfShardsWithNoWriteLoad; // We create single-shard indices for simplicity's sake and to make it clear the shards are independent of each other final var indices = new ArrayList(); @@ -952,20 +953,20 @@ public void testShardMovementPriorityComparator() { final var allShards = stateWithIndices.routingTable(ProjectId.DEFAULT).allShards().collect(toSet()); final var shardWriteLoads = new HashMap(); - addRandomWriteLoadAndRemoveShard(shardWriteLoads, allShards, numAtMax, () -> maxWriteLoad); + addRandomWriteLoadAndRemoveShard(shardWriteLoads, allShards, numberOfShardsWithMaxWriteLoad, () -> maxWriteLoad); addRandomWriteLoadAndRemoveShard( shardWriteLoads, allShards, - numBetweenThresholdAndMax, + numberOfShardsWithWriteLoadBetweenThresholdAndMax, () -> randomDoubleBetween(writeLoadThreshold, maxWriteLoad, true) ); addRandomWriteLoadAndRemoveShard( shardWriteLoads, allShards, - numBelowThreshold, + numberOfShardsWithWriteLoadBelowThreshold, () -> randomDoubleBetween(0, writeLoadThreshold, true) ); - assertThat(allShards, hasSize(numMissing)); + assertThat(allShards, hasSize(numberOfShardsWithNoWriteLoad)); final var allocation = new RoutingAllocation( new AllocationDeciders(List.of()), @@ -996,8 +997,8 @@ public void testShardMovementPriorityComparator() { double lastWriteLoad = 0.0; int currentIndex = 0; - logger.info("--> expecting {} between threshold and max in ascending order", numBetweenThresholdAndMax); - for (int i = 0; i < numBetweenThresholdAndMax; i++) { + logger.info("--> expecting {} between threshold and max in ascending order", numberOfShardsWithWriteLoadBetweenThresholdAndMax); + for (int i = 0; i < numberOfShardsWithWriteLoadBetweenThresholdAndMax; i++) { final var currentShardId = sortedShards.get(currentIndex++).shardId(); assertThat(shardWriteLoads, Matchers.hasKey(currentShardId)); final double currentWriteLoad = shardWriteLoads.get(currentShardId); @@ -1007,8 +1008,8 @@ public void testShardMovementPriorityComparator() { assertThat(currentWriteLoad, greaterThanOrEqualTo(lastWriteLoad)); } } - logger.info("--> expecting {} below threshold in descending order", numBelowThreshold); - for (int i = 0; i < numBelowThreshold; i++) { + logger.info("--> expecting {} below threshold in descending order", numberOfShardsWithWriteLoadBelowThreshold); + for (int i = 0; i < numberOfShardsWithWriteLoadBelowThreshold; i++) { final var currentShardId = sortedShards.get(currentIndex++).shardId(); assertThat(shardWriteLoads, Matchers.hasKey(currentShardId)); final double currentWriteLoad = shardWriteLoads.get(currentShardId); @@ -1018,15 +1019,15 @@ public void testShardMovementPriorityComparator() { assertThat(currentWriteLoad, lessThanOrEqualTo(lastWriteLoad)); } } - logger.info("--> expecting {} at max", numAtMax); - for (int i = 0; i < numAtMax; i++) { + logger.info("--> expecting {} at max", numberOfShardsWithMaxWriteLoad); + for (int i = 0; i < numberOfShardsWithMaxWriteLoad; i++) { final var currentShardId = sortedShards.get(currentIndex++).shardId(); assertThat(shardWriteLoads, Matchers.hasKey(currentShardId)); final double currentWriteLoad = shardWriteLoads.get(currentShardId); assertThat(currentWriteLoad, equalTo(maxWriteLoad)); } - logger.info("--> expecting {} missing", numMissing); - for (int i = 0; i < numMissing; i++) { + logger.info("--> expecting {} missing", numberOfShardsWithNoWriteLoad); + for (int i = 0; i < numberOfShardsWithNoWriteLoad; i++) { final var currentShardId = sortedShards.get(currentIndex++); assertThat(shardWriteLoads, Matchers.not(Matchers.hasKey(currentShardId.shardId()))); } From 7643dd0e64b2791b9dfe87c12a7186bc8ed9ce66 Mon Sep 17 00:00:00 2001 From: Nick Tindall Date: Mon, 6 Oct 2025 11:35:24 +1100 Subject: [PATCH 58/63] Improve naming --- .../allocator/BalancedShardsAllocatorTests.java | 10 +++------- 1 file changed, 3 insertions(+), 7 deletions(-) diff --git a/server/src/test/java/org/elasticsearch/cluster/routing/allocation/allocator/BalancedShardsAllocatorTests.java b/server/src/test/java/org/elasticsearch/cluster/routing/allocation/allocator/BalancedShardsAllocatorTests.java index 7afc8483dcf06..b13d4c3ee88db 100644 --- a/server/src/test/java/org/elasticsearch/cluster/routing/allocation/allocator/BalancedShardsAllocatorTests.java +++ b/server/src/test/java/org/elasticsearch/cluster/routing/allocation/allocator/BalancedShardsAllocatorTests.java @@ -945,13 +945,9 @@ public void testShardMovementPriorityComparator() { } final var nodeId = randomIdentifier(); - final var stateWithIndices = createStateWithIndices( - List.of(nodeId), - shardId -> nodeId, - indices.toArray(IndexMetadata.Builder[]::new) - ); + final var clusterState = createStateWithIndices(List.of(nodeId), shardId -> nodeId, indices.toArray(IndexMetadata.Builder[]::new)); - final var allShards = stateWithIndices.routingTable(ProjectId.DEFAULT).allShards().collect(toSet()); + final var allShards = clusterState.routingTable(ProjectId.DEFAULT).allShards().collect(toSet()); final var shardWriteLoads = new HashMap(); addRandomWriteLoadAndRemoveShard(shardWriteLoads, allShards, numberOfShardsWithMaxWriteLoad, () -> maxWriteLoad); addRandomWriteLoadAndRemoveShard( @@ -970,7 +966,7 @@ public void testShardMovementPriorityComparator() { final var allocation = new RoutingAllocation( new AllocationDeciders(List.of()), - stateWithIndices, + clusterState, ClusterInfo.builder().shardWriteLoads(shardWriteLoads).build(), SNAPSHOT_INFO_SERVICE_WITH_NO_SHARD_SIZES.snapshotShardSizes(), System.nanoTime() From e514692f49fc93920feab90fd0f4c4d169d6b206 Mon Sep 17 00:00:00 2001 From: Nick Tindall Date: Mon, 6 Oct 2025 12:09:10 +1100 Subject: [PATCH 59/63] Reference comparator in javadoc --- .../allocator/BalancedShardsAllocatorTests.java | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/server/src/test/java/org/elasticsearch/cluster/routing/allocation/allocator/BalancedShardsAllocatorTests.java b/server/src/test/java/org/elasticsearch/cluster/routing/allocation/allocator/BalancedShardsAllocatorTests.java index b13d4c3ee88db..e4a4d462062ca 100644 --- a/server/src/test/java/org/elasticsearch/cluster/routing/allocation/allocator/BalancedShardsAllocatorTests.java +++ b/server/src/test/java/org/elasticsearch/cluster/routing/allocation/allocator/BalancedShardsAllocatorTests.java @@ -36,6 +36,7 @@ import org.elasticsearch.cluster.routing.ShardRoutingState; import org.elasticsearch.cluster.routing.allocation.AllocateUnassignedDecision; import org.elasticsearch.cluster.routing.allocation.RoutingAllocation; +import org.elasticsearch.cluster.routing.allocation.allocator.BalancedShardsAllocator.Balancer.PrioritiseByShardWriteLoadComparator; import org.elasticsearch.cluster.routing.allocation.decider.AllocationDecider; import org.elasticsearch.cluster.routing.allocation.decider.AllocationDeciders; import org.elasticsearch.cluster.routing.allocation.decider.Decision; @@ -928,6 +929,10 @@ public void testReturnEarlyOnShardAssignmentChanges() { applyStartedShardsUntilNoChange(clusterState, allocationService); } + /** + * Test for {@link PrioritiseByShardWriteLoadComparator}. See Comparator Javadoc for expected + * ordering. + */ public void testShardMovementPriorityComparator() { final double maxWriteLoad = randomDoubleBetween(0.0, 100.0, true); final double writeLoadThreshold = maxWriteLoad * THRESHOLD_RATIO; @@ -978,10 +983,7 @@ public void testShardMovementPriorityComparator() { allocatedRoutingNodes.initializeShard(shardRouting, nodeId, null, randomNonNegativeLong(), NOOP); } - final var comparator = new BalancedShardsAllocator.Balancer.PrioritiseByShardWriteLoadComparator( - allocation, - allocatedRoutingNodes.node(nodeId) - ); + final var comparator = new PrioritiseByShardWriteLoadComparator(allocation, allocatedRoutingNodes.node(nodeId)); logger.info("--> testing shard movement priority comparator, maxValue={}, threshold={}", maxWriteLoad, writeLoadThreshold); var sortedShards = allocatedRoutingNodes.getAssignedShards().values().stream().flatMap(List::stream).sorted(comparator).toList(); From ddc77392cf6bd2f8b8b9532bbfe53d5b9d90a57b Mon Sep 17 00:00:00 2001 From: Nick Tindall Date: Mon, 6 Oct 2025 12:14:45 +1100 Subject: [PATCH 60/63] Document test decider --- .../allocation/allocator/BalancedShardsAllocatorTests.java | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/server/src/test/java/org/elasticsearch/cluster/routing/allocation/allocator/BalancedShardsAllocatorTests.java b/server/src/test/java/org/elasticsearch/cluster/routing/allocation/allocator/BalancedShardsAllocatorTests.java index e4a4d462062ca..f46b83c9cb3c3 100644 --- a/server/src/test/java/org/elasticsearch/cluster/routing/allocation/allocator/BalancedShardsAllocatorTests.java +++ b/server/src/test/java/org/elasticsearch/cluster/routing/allocation/allocator/BalancedShardsAllocatorTests.java @@ -1270,8 +1270,9 @@ private Decision nodePrefixMatchesIndexPrefix(ShardRouting shardRouting, Routing } /** - * Returns {@link Decision#NOT_PREFERRED} for any shard with 'not-preferred' in the index - * name, until it's moved to another node. + * Returns {@link Decision#NOT_PREFERRED} from {@link #canRemain(IndexMetadata, ShardRouting, RoutingNode, RoutingAllocation)} for + * any shard with 'not-preferred' in the index name. Once the shard has been moved to a different node, {@link Decision#YES} is + * returned. */ private static class MoveNotPreferredOnceDecider extends AllocationDecider { From 4170af3bfc7edad211930af50f43b618b8401e28 Mon Sep 17 00:00:00 2001 From: Nick Tindall Date: Mon, 6 Oct 2025 15:50:17 +1100 Subject: [PATCH 61/63] Test movement prioritization in IT --- .../decider/WriteLoadConstraintDeciderIT.java | 86 +++++++++++++++---- .../allocator/BalancedShardsAllocator.java | 8 +- .../BalancedShardsAllocatorTests.java | 2 +- 3 files changed, 73 insertions(+), 23 deletions(-) diff --git a/server/src/internalClusterTest/java/org/elasticsearch/cluster/routing/allocation/decider/WriteLoadConstraintDeciderIT.java b/server/src/internalClusterTest/java/org/elasticsearch/cluster/routing/allocation/decider/WriteLoadConstraintDeciderIT.java index 2548836b9e634..fb0363e8c2b37 100644 --- a/server/src/internalClusterTest/java/org/elasticsearch/cluster/routing/allocation/decider/WriteLoadConstraintDeciderIT.java +++ b/server/src/internalClusterTest/java/org/elasticsearch/cluster/routing/allocation/decider/WriteLoadConstraintDeciderIT.java @@ -28,6 +28,7 @@ import org.elasticsearch.cluster.routing.ShardRouting; import org.elasticsearch.cluster.routing.UnassignedInfo; import org.elasticsearch.cluster.routing.allocation.WriteLoadConstraintSettings; +import org.elasticsearch.cluster.routing.allocation.allocator.BalancedShardsAllocator; import org.elasticsearch.cluster.routing.allocation.allocator.DesiredBalanceMetrics; import org.elasticsearch.cluster.routing.allocation.allocator.DesiredBalanceShardsAllocator; import org.elasticsearch.cluster.service.ClusterService; @@ -54,14 +55,18 @@ import java.nio.file.Path; import java.util.ArrayList; import java.util.Collection; +import java.util.Collections; import java.util.HashMap; import java.util.List; import java.util.Map; +import java.util.Set; import java.util.concurrent.CountDownLatch; +import java.util.stream.StreamSupport; import static java.util.stream.IntStream.range; import static org.elasticsearch.cluster.metadata.IndexMetadata.SETTING_NUMBER_OF_REPLICAS; import static org.elasticsearch.cluster.metadata.IndexMetadata.SETTING_NUMBER_OF_SHARDS; +import static org.hamcrest.Matchers.equalTo; import static org.hamcrest.Matchers.everyItem; import static org.hamcrest.Matchers.greaterThanOrEqualTo; import static org.hamcrest.Matchers.hasSize; @@ -130,7 +135,7 @@ public void testHighNodeWriteLoadPreventsNewShardAllocation() { setUpMockTransportIndicesStatsResponse( harness.firstDiscoveryNode, indexMetadata.getNumberOfShards(), - createShardStatsResponseForIndex(indexMetadata, harness.randomShardWriteLoad, harness.firstDataNodeId) + createShardStatsResponseForIndex(indexMetadata, harness.maxShardWriteLoad, harness.firstDataNodeId) ); setUpMockTransportIndicesStatsResponse(harness.secondDiscoveryNode, 0, List.of()); setUpMockTransportIndicesStatsResponse(harness.thirdDiscoveryNode, 0, List.of()); @@ -235,7 +240,7 @@ public void testShardsAreAssignedToNotPreferredWhenAlternativeIsNo() { setUpMockTransportIndicesStatsResponse( harness.firstDiscoveryNode, indexMetadata.getNumberOfShards(), - createShardStatsResponseForIndex(indexMetadata, harness.randomShardWriteLoad, harness.firstDataNodeId) + createShardStatsResponseForIndex(indexMetadata, harness.maxShardWriteLoad, harness.firstDataNodeId) ); setUpMockTransportIndicesStatsResponse(harness.secondDiscoveryNode, 0, List.of()); setUpMockTransportIndicesStatsResponse(harness.thirdDiscoveryNode, 0, List.of()); @@ -333,7 +338,7 @@ public void testCanRemainNotPreferredIsIgnoredWhenAllOtherNodesReturnNotPreferre setUpMockTransportIndicesStatsResponse( harness.firstDiscoveryNode, indexMetadata.getNumberOfShards(), - createShardStatsResponseForIndex(indexMetadata, harness.randomShardWriteLoad, harness.firstDataNodeId) + createShardStatsResponseForIndex(indexMetadata, harness.maxShardWriteLoad, harness.firstDataNodeId) ); setUpMockTransportIndicesStatsResponse(harness.secondDiscoveryNode, 0, List.of()); setUpMockTransportIndicesStatsResponse(harness.thirdDiscoveryNode, 0, List.of()); @@ -429,15 +434,12 @@ public void testCanRemainRelocatesOneShardWhenAHotSpotOccurs() { * will show that all shards have non-empty write load stats (so that the WriteLoadDecider will evaluate assigning them to a node). */ - IndexMetadata indexMetadata = internalCluster().getCurrentMasterNodeInstance(ClusterService.class) - .state() - .getMetadata() - .getProject() - .index(harness.indexName); + final ClusterState originalClusterState = internalCluster().getCurrentMasterNodeInstance(ClusterService.class).state(); + final IndexMetadata indexMetadata = originalClusterState.getMetadata().getProject().index(harness.indexName); setUpMockTransportIndicesStatsResponse( harness.firstDiscoveryNode, indexMetadata.getNumberOfShards(), - createShardStatsResponseForIndex(indexMetadata, harness.randomShardWriteLoad, harness.firstDataNodeId) + createShardStatsResponseForIndex(indexMetadata, harness.maxShardWriteLoad, harness.firstDataNodeId) ); setUpMockTransportIndicesStatsResponse(harness.secondDiscoveryNode, 0, List.of()); setUpMockTransportIndicesStatsResponse(harness.thirdDiscoveryNode, 0, List.of()); @@ -483,6 +485,7 @@ public void testCanRemainRelocatesOneShardWhenAHotSpotOccurs() { harness.randomNumberOfShards, countShardsStillAssignedToFirstNode + 1 ); + assertThatTheBestShardWasMoved(harness, originalClusterState, desiredBalanceResponse); } catch (AssertionError error) { ClusterState state = client().admin() .cluster() @@ -498,6 +501,36 @@ public void testCanRemainRelocatesOneShardWhenAHotSpotOccurs() { } } + /** + * Determine which shard was moved and check that it's the "best" according to + * {@link org.elasticsearch.cluster.routing.allocation.allocator.BalancedShardsAllocator.Balancer.PrioritiseByShardWriteLoadComparator} + */ + private void assertThatTheBestShardWasMoved( + TestHarness harness, + ClusterState originalClusterState, + DesiredBalanceResponse desiredBalanceResponse + ) { + int movedShardId = desiredBalanceResponse.getRoutingTable().get(harness.indexName).entrySet().stream().filter(e -> { + Set desiredNodeIds = e.getValue().desired().nodeIds(); + return desiredNodeIds.contains(harness.secondDiscoveryNode.getId()) + || desiredNodeIds.contains(harness.thirdDiscoveryNode.getId()); + }).findFirst().map(Map.Entry::getKey).orElseThrow(() -> new AssertionError("No shard was moved to a non-hot-spotting node")); + + final BalancedShardsAllocator.Balancer.PrioritiseByShardWriteLoadComparator comparator = + new BalancedShardsAllocator.Balancer.PrioritiseByShardWriteLoadComparator( + desiredBalanceResponse.getClusterInfo(), + originalClusterState.getRoutingNodes().node(harness.firstDataNodeId) + ); + + final List bestShardsToMove = StreamSupport.stream( + originalClusterState.getRoutingNodes().node(harness.firstDataNodeId).spliterator(), + false + ).sorted(comparator).toList(); + + // The moved shard should be at the head of the sorted list + assertThat(movedShardId, equalTo(bestShardsToMove.get(0).shardId().id())); + } + public void testMaxQueueLatencyMetricIsPublished() { final Settings settings = Settings.builder() .put( @@ -663,12 +696,29 @@ private NodeUsageStatsForThreadPools createNodeUsageStatsForThreadPools( */ private List createShardStatsResponseForIndex( IndexMetadata indexMetadata, - float peakShardWriteLoad, + float maximumShardWriteLoad, String assignedShardNodeId ) { - List shardStats = new ArrayList<>(indexMetadata.getNumberOfShards()); + // Randomly distribute shards' write-loads we can see that they're being prioritized correctly + final double writeLoadThreshold = maximumShardWriteLoad + * BalancedShardsAllocator.Balancer.PrioritiseByShardWriteLoadComparator.THRESHOLD_RATIO; + final List shardPeakWriteLoads = new ArrayList<>(); + // Need at least one with the maximum write-load + shardPeakWriteLoads.add((double) maximumShardWriteLoad); + final int remainingShards = indexMetadata.getNumberOfShards() - 1; + // Some over-threshold, some under + for (int i = 0; i < remainingShards; ++i) { + if (randomBoolean()) { + shardPeakWriteLoads.add(randomDoubleBetween(writeLoadThreshold, maximumShardWriteLoad, true)); + } else { + shardPeakWriteLoads.add(randomDoubleBetween(0.0, writeLoadThreshold, true)); + } + } + assertThat(shardPeakWriteLoads, hasSize(indexMetadata.getNumberOfShards())); + Collections.shuffle(shardPeakWriteLoads, random()); + final List shardStats = new ArrayList<>(indexMetadata.getNumberOfShards()); for (int i = 0; i < indexMetadata.getNumberOfShards(); i++) { - shardStats.add(createShardStats(indexMetadata, i, peakShardWriteLoad, assignedShardNodeId)); + shardStats.add(createShardStats(indexMetadata, i, shardPeakWriteLoads.get(i), assignedShardNodeId)); } return shardStats; } @@ -719,7 +769,7 @@ private TestHarness setUpThreeTestNodesAndAllIndexShardsOnFirstNode() { int randomUtilizationThresholdPercent = randomIntBetween(50, 100); int randomNumberOfWritePoolThreads = randomIntBetween(2, 20); long randomQueueLatencyThresholdMillis = randomLongBetween(1, 20_000); - float randomShardWriteLoad = randomFloatBetween(0.0f, 0.01f, false); + float maximumShardWriteLoad = randomFloatBetween(0.0f, 0.01f, false); Settings settings = enabledWriteLoadDeciderSettings(randomUtilizationThresholdPercent, randomQueueLatencyThresholdMillis); internalCluster().startMasterOnlyNode(settings); @@ -756,8 +806,8 @@ private TestHarness setUpThreeTestNodesAndAllIndexShardsOnFirstNode() { + randomUtilizationThresholdPercent + ", write threads: " + randomNumberOfWritePoolThreads - + ", individual shard write loads: " - + randomShardWriteLoad + + ", maximum shard write load: " + + maximumShardWriteLoad ); /** @@ -775,7 +825,7 @@ private TestHarness setUpThreeTestNodesAndAllIndexShardsOnFirstNode() { // Calculate the maximum utilization a node can report while still being able to accept all relocating shards int shardWriteLoadOverhead = shardLoadUtilizationOverhead( - randomShardWriteLoad * randomNumberOfShards, + maximumShardWriteLoad * randomNumberOfShards, randomNumberOfWritePoolThreads ); int maxUtilBelowThresholdThatAllowsAllShardsToRelocate = randomUtilizationThresholdPercent - shardWriteLoadOverhead - 1; @@ -819,7 +869,7 @@ private TestHarness setUpThreeTestNodesAndAllIndexShardsOnFirstNode() { randomUtilizationThresholdPercent, randomNumberOfWritePoolThreads, randomQueueLatencyThresholdMillis, - randomShardWriteLoad, + maximumShardWriteLoad, indexName, randomNumberOfShards, maxUtilBelowThresholdThatAllowsAllShardsToRelocate @@ -842,7 +892,7 @@ record TestHarness( int randomUtilizationThresholdPercent, int randomNumberOfWritePoolThreads, long randomQueueLatencyThresholdMillis, - float randomShardWriteLoad, + float maxShardWriteLoad, String indexName, int randomNumberOfShards, int maxUtilBelowThresholdThatAllowsAllShardsToRelocate diff --git a/server/src/main/java/org/elasticsearch/cluster/routing/allocation/allocator/BalancedShardsAllocator.java b/server/src/main/java/org/elasticsearch/cluster/routing/allocation/allocator/BalancedShardsAllocator.java index ca0710bedb2a7..fe42a3dfe7ecd 100644 --- a/server/src/main/java/org/elasticsearch/cluster/routing/allocation/allocator/BalancedShardsAllocator.java +++ b/server/src/main/java/org/elasticsearch/cluster/routing/allocation/allocator/BalancedShardsAllocator.java @@ -1026,7 +1026,7 @@ public boolean shardIsBetterThanCurrent(ShardRouting shardRouting) { } int comparison = comparatorCache.computeIfAbsent( shardRouting.currentNodeId(), - nodeId -> new PrioritiseByShardWriteLoadComparator(allocation, allocation.routingNodes().node(nodeId)) + nodeId -> new PrioritiseByShardWriteLoadComparator(allocation.clusterInfo(), allocation.routingNodes().node(nodeId)) ).compare(shardRouting, currentShardForNode.shardRouting()); // Ignore inferior non-preferred moves return comparison < 0; @@ -1059,7 +1059,7 @@ public Iterable getBestShardMovements() { * */ // Visible for testing - static class PrioritiseByShardWriteLoadComparator implements Comparator { + public static class PrioritiseByShardWriteLoadComparator implements Comparator { /** * This is the threshold over which we consider shards to have a "high" write load represented @@ -1074,8 +1074,8 @@ static class PrioritiseByShardWriteLoadComparator implements Comparator testing shard movement priority comparator, maxValue={}, threshold={}", maxWriteLoad, writeLoadThreshold); var sortedShards = allocatedRoutingNodes.getAssignedShards().values().stream().flatMap(List::stream).sorted(comparator).toList(); From 43a2dd3d7e889fb8880d6fe3984ef4bb586d1b93 Mon Sep 17 00:00:00 2001 From: Nick Tindall Date: Tue, 7 Oct 2025 09:54:24 +1100 Subject: [PATCH 62/63] Update server/src/internalClusterTest/java/org/elasticsearch/cluster/routing/allocation/decider/WriteLoadConstraintDeciderIT.java Co-authored-by: Dianna Hohensee --- .../allocation/decider/WriteLoadConstraintDeciderIT.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/server/src/internalClusterTest/java/org/elasticsearch/cluster/routing/allocation/decider/WriteLoadConstraintDeciderIT.java b/server/src/internalClusterTest/java/org/elasticsearch/cluster/routing/allocation/decider/WriteLoadConstraintDeciderIT.java index fb0363e8c2b37..63d498a7998b1 100644 --- a/server/src/internalClusterTest/java/org/elasticsearch/cluster/routing/allocation/decider/WriteLoadConstraintDeciderIT.java +++ b/server/src/internalClusterTest/java/org/elasticsearch/cluster/routing/allocation/decider/WriteLoadConstraintDeciderIT.java @@ -699,7 +699,7 @@ private List createShardStatsResponseForIndex( float maximumShardWriteLoad, String assignedShardNodeId ) { - // Randomly distribute shards' write-loads we can see that they're being prioritized correctly + // Randomly distribute shards' write-loads so that we can check later that shard movements are prioritized correctly final double writeLoadThreshold = maximumShardWriteLoad * BalancedShardsAllocator.Balancer.PrioritiseByShardWriteLoadComparator.THRESHOLD_RATIO; final List shardPeakWriteLoads = new ArrayList<>(); From 29a79e13f14e60d6856ff411a61405c7ba317aaa Mon Sep 17 00:00:00 2001 From: Nick Tindall Date: Tue, 7 Oct 2025 09:56:43 +1100 Subject: [PATCH 63/63] Fix Javadoc --- .../allocation/decider/WriteLoadConstraintDeciderIT.java | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/server/src/internalClusterTest/java/org/elasticsearch/cluster/routing/allocation/decider/WriteLoadConstraintDeciderIT.java b/server/src/internalClusterTest/java/org/elasticsearch/cluster/routing/allocation/decider/WriteLoadConstraintDeciderIT.java index 63d498a7998b1..2021818db57b1 100644 --- a/server/src/internalClusterTest/java/org/elasticsearch/cluster/routing/allocation/decider/WriteLoadConstraintDeciderIT.java +++ b/server/src/internalClusterTest/java/org/elasticsearch/cluster/routing/allocation/decider/WriteLoadConstraintDeciderIT.java @@ -692,14 +692,16 @@ private NodeUsageStatsForThreadPools createNodeUsageStatsForThreadPools( } /** - * Helper to create a list of dummy {@link ShardStats} for the given index, each shard reporting a {@code peakShardWriteLoad} stat. + * Helper to create a list of dummy {@link ShardStats} for the given index, each shard being randomly allocated a peak write load + * between 0 and {@code maximumShardWriteLoad}. There will always be at least one shard reporting the specified + * {@code maximumShardWriteLoad}. */ private List createShardStatsResponseForIndex( IndexMetadata indexMetadata, float maximumShardWriteLoad, String assignedShardNodeId ) { - // Randomly distribute shards' write-loads so that we can check later that shard movements are prioritized correctly + // Randomly distribute shards' peak write-loads so that we can check later that shard movements are prioritized correctly final double writeLoadThreshold = maximumShardWriteLoad * BalancedShardsAllocator.Balancer.PrioritiseByShardWriteLoadComparator.THRESHOLD_RATIO; final List shardPeakWriteLoads = new ArrayList<>();