Commits
53bfa9b
Implement AllocationDeciders#findNonPreferred
nicktindall Sep 8, 2025
971a395
Merge branch 'main' into ES-12739_select_hot_shard_to_move_off_data_node
nicktindall Sep 8, 2025
f0f9f77
Fix assertion
nicktindall Sep 8, 2025
71232d5
Implement prioritisable problems
nicktindall Sep 8, 2025
c31b05a
Javadoc
nicktindall Sep 8, 2025
967e76e
Example for write load constraint decider
nicktindall Sep 8, 2025
d147a18
Fix text
nicktindall Sep 8, 2025
4f7b519
Tidy
nicktindall Sep 8, 2025
91ee197
Fix boolean logic
nicktindall Sep 9, 2025
1d3b08e
Introduce pluggable non-preferred iteration
nicktindall Sep 10, 2025
687c6e2
Implement NonPreferredShardIteratorFactory for resolving hot-spots
nicktindall Sep 10, 2025
1a4c85a
Remove unused default implementation
nicktindall Sep 10, 2025
a14af62
Merge remote-tracking branch 'origin/main' into ES-12739_pluggable_no…
nicktindall Sep 10, 2025
3196d21
Get rid of remnants of prior approach
nicktindall Sep 10, 2025
d63012c
Remove cruft
nicktindall Sep 10, 2025
7918b3b
Improve naming/javadoc
nicktindall Sep 10, 2025
d349668
Improve wiring
nicktindall Sep 10, 2025
0c34875
Fix infinite loop
nicktindall Sep 10, 2025
22ad4d9
Merge branch 'main' into ES-12739_pluggable_non_preferred_iteration
nicktindall Sep 10, 2025
b7fcc4a
Test/fix iterator logic
nicktindall Sep 11, 2025
ab73569
Test shard iteration order
nicktindall Sep 11, 2025
cebf3a9
Use Iterable instead of Iterator
nicktindall Sep 11, 2025
e215a55
Comment
nicktindall Sep 11, 2025
53c7b75
Test when decider not fully enabled
nicktindall Sep 11, 2025
0d4a5c7
Naming
nicktindall Sep 11, 2025
af93b87
Only move a single non-preferred shard, do move non-preferred before …
nicktindall Sep 16, 2025
de052a3
Sort shards correctly
nicktindall Sep 17, 2025
d59ea2b
Use streams instead of sorting shards up-front
nicktindall Sep 18, 2025
ab127a9
Merge remote-tracking branch 'origin/main' into ES-12739_pluggable_no…
nicktindall Sep 18, 2025
6093a7a
Fix javadoc
nicktindall Sep 18, 2025
cf03477
Use record class
nicktindall Sep 18, 2025
e79a84a
Test that all shards are returned
nicktindall Sep 18, 2025
a433918
Merge remote-tracking branch 'origin/main' into ES-12739_pluggable_no…
nicktindall Sep 18, 2025
ac0ec29
Add NODE_INTERLEAVED as an iteration order
nicktindall Sep 18, 2025
c2ee39f
Javadoc for NonPreferredShardIteratorFactory
nicktindall Sep 18, 2025
69a545a
Javadoc
nicktindall Sep 18, 2025
c76af05
Try to simplify condition
nicktindall Sep 18, 2025
630d06d
in-line tryMoveShardIfNonPreferred
nicktindall Sep 18, 2025
5103231
Move new behaviour together
nicktindall Sep 18, 2025
e888265
Comment on NOOP default
nicktindall Sep 18, 2025
20 changes: 18 additions & 2 deletions server/src/main/java/org/elasticsearch/cluster/ClusterModule.java
@@ -39,10 +39,12 @@
import org.elasticsearch.cluster.routing.allocation.NodeAllocationStatsAndWeightsCalculator;
import org.elasticsearch.cluster.routing.allocation.RoutingAllocation;
import org.elasticsearch.cluster.routing.allocation.ShardAllocationDecision;
import org.elasticsearch.cluster.routing.allocation.WriteLoadConstraintSettings;
import org.elasticsearch.cluster.routing.allocation.WriteLoadForecaster;
import org.elasticsearch.cluster.routing.allocation.allocator.BalancedShardsAllocator;
import org.elasticsearch.cluster.routing.allocation.allocator.BalancerSettings;
import org.elasticsearch.cluster.routing.allocation.allocator.BalancingWeightsFactory;
import org.elasticsearch.cluster.routing.allocation.allocator.DefaultNonPreferredShardIteratorFactory;
import org.elasticsearch.cluster.routing.allocation.allocator.DesiredBalanceMetrics;
import org.elasticsearch.cluster.routing.allocation.allocator.DesiredBalanceShardsAllocator;
import org.elasticsearch.cluster.routing.allocation.allocator.DesiredBalanceShardsAllocator.DesiredBalanceReconcilerAction;
@@ -504,16 +506,30 @@ private static ShardsAllocator createShardsAllocator(
ShardAllocationExplainer shardAllocationExplainer,
DesiredBalanceMetrics desiredBalanceMetrics
) {
WriteLoadConstraintSettings writeLoadConstraintSettings = new WriteLoadConstraintSettings(clusterSettings);
DefaultNonPreferredShardIteratorFactory nonPreferredShardIteratorFactory = new DefaultNonPreferredShardIteratorFactory(
writeLoadConstraintSettings
);
Map<String, Supplier<ShardsAllocator>> allocators = new HashMap<>();
allocators.put(
BALANCED_ALLOCATOR,
() -> new BalancedShardsAllocator(balancerSettings, writeLoadForecaster, balancingWeightsFactory)
() -> new BalancedShardsAllocator(
balancerSettings,
writeLoadForecaster,
balancingWeightsFactory,
nonPreferredShardIteratorFactory
Contributor:

Rather than passing the factory implementation through the BalancedShardsAllocator and Balancer constructors, could we directly add the logic to the Balancer in the first place? Avoid the factory. The other objects passed through the constructors are usually shared with other components, whereas the new logic only runs in the Balancer.

moveNonPreferred could be gated by the WRITE_LOAD_DECIDER_ENABLED_SETTING, as an alternative to the NOOP implementation. I don't think tests would even be able to exercise moveNonPreferred without some hot-spot mocking to get to a 5 second queue latency, even if the new logic were enabled by default.

Though perhaps there is some other reason for the NOOP / adding it here that I'm missing. Factories often come into play for stateful vs stateless implementations, but we don't have an alternative real implementation.

Contributor Author:

The idea here is just to put a boundary on the responsibilities of the two classes: the BalancedShardsAllocator doesn't care about the iteration order of the shards, and as long as the iterator contains all the shards this logic will work.

Similarly, the BalancedShardsAllocator doesn't care what the individual deciders do; it just knows about YES/NO/THROTTLE/NOT_PREFERRED.

In my opinion the interface delineates responsibilities and allows the reader to ignore the implementation details of the iteration order when reading the BalancedShardsAllocator. It also frees us up to bake all kinds of knowledge about the configured deciders into our implementation without that knowledge leaking into the BalancedShardsAllocator.

Contributor Author:

The default implementation could equally be

allocation -> allocation.routingNodes().nodeInterleavedShardIterator()
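
For reference, a minimal sketch of what the factory interface being wired here looks like, inferred only from its usages in this diff (the single createNonPreferredShardIterator method and the NOOP constant); the actual source file may declare more:

    import java.util.Collections;
    import java.util.Iterator;

    import org.elasticsearch.cluster.routing.ShardRouting;
    import org.elasticsearch.cluster.routing.allocation.RoutingAllocation;

    // Sketch only: inferred from the calls in this PR, not copied from the real interface.
    public interface NonPreferredShardIteratorFactory {

        /** Shards to consider moving off their current node, most desirable to move first. */
        Iterator<ShardRouting> createNonPreferredShardIterator(RoutingAllocation allocation);

        /** Default that yields nothing, so moveNonPreferred() becomes a no-op. */
        NonPreferredShardIteratorFactory NOOP = allocation -> Collections.emptyIterator();
    }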

)
);
allocators.put(
DESIRED_BALANCE_ALLOCATOR,
() -> new DesiredBalanceShardsAllocator(
clusterSettings,
new BalancedShardsAllocator(balancerSettings, writeLoadForecaster, balancingWeightsFactory),
new BalancedShardsAllocator(
balancerSettings,
writeLoadForecaster,
balancingWeightsFactory,
nonPreferredShardIteratorFactory
),
threadPool,
clusterService,
reconciler,
server/src/main/java/org/elasticsearch/cluster/routing/allocation/allocator/BalancedShardsAllocator.java
@@ -114,6 +114,7 @@ public class BalancedShardsAllocator implements ShardsAllocator {
private final BalancerSettings balancerSettings;
private final WriteLoadForecaster writeLoadForecaster;
private final BalancingWeightsFactory balancingWeightsFactory;
private final NonPreferredShardIteratorFactory nonPreferredShardIteratorFactory;

public BalancedShardsAllocator() {
this(Settings.EMPTY);
@@ -124,18 +125,25 @@ public BalancedShardsAllocator(Settings settings) {
}

public BalancedShardsAllocator(BalancerSettings balancerSettings, WriteLoadForecaster writeLoadForecaster) {
this(balancerSettings, writeLoadForecaster, new GlobalBalancingWeightsFactory(balancerSettings));
this(
balancerSettings,
writeLoadForecaster,
new GlobalBalancingWeightsFactory(balancerSettings),
NonPreferredShardIteratorFactory.NOOP
);
}

@Inject
public BalancedShardsAllocator(
BalancerSettings balancerSettings,
WriteLoadForecaster writeLoadForecaster,
BalancingWeightsFactory balancingWeightsFactory
BalancingWeightsFactory balancingWeightsFactory,
NonPreferredShardIteratorFactory nonPreferredShardIteratorFactory
) {
this.balancerSettings = balancerSettings;
this.writeLoadForecaster = writeLoadForecaster;
this.balancingWeightsFactory = balancingWeightsFactory;
this.nonPreferredShardIteratorFactory = nonPreferredShardIteratorFactory;
}

@Override
@@ -152,9 +160,16 @@ public void allocate(RoutingAllocation allocation) {
return;
}
final BalancingWeights balancingWeights = balancingWeightsFactory.create();
final Balancer balancer = new Balancer(writeLoadForecaster, allocation, balancerSettings.getThreshold(), balancingWeights);
final Balancer balancer = new Balancer(
writeLoadForecaster,
allocation,
balancerSettings.getThreshold(),
balancingWeights,
nonPreferredShardIteratorFactory
);
balancer.allocateUnassigned();
balancer.moveShards();
balancer.moveNonPreferred();
Contributor:

What are your thoughts on this ordering? I figured we'd need to run the new logic before moveShards, since moveShards could trigger the simulator to consider the hot-spot addressed before we check in moveNonPreferred.

nicktindall (Contributor Author), Sep 15, 2025:

I think it depends on whether you think fixing hot-spots or shutting down nodes is more important. The naming would suggest it's more important to move canRemain=NO than canRemain=NOT_PREFERRED shards, so moveShards should get first priority at movement. But, as you say, because moveNonPreferred prioritises movements, we may make sub-optimal moves when NO and NOT_PREFERRED intersect. If there is an intersection, that would most likely mean there is a shutting-down node that is also hot-spotting, in which case we have to evacuate all the shards either way.

I don't have a strong preference here because I hope it's rare enough not to matter. I'm inclined to follow the naming and prioritise moving NOs; perhaps we need to apply our prioritisation there too?

DiannaHohensee (Contributor), Sep 15, 2025:

Since this currently breaks the shard moves simulator, can we run moveNonPreferred before moveShards? Otherwise we could have eliminated the hot-spot in the simulator, via moveShards relocations, by the time the code gets here.

I don't think the ordering will have a significant impact on shard movement, especially since, to fix a hot-spot, we're moving a single shard per node per 30-second stats refresh cycle: I'd expect it to be irrelevant noise compared to the number of shards moved away from a shutting-down node. This is also the allocator; the decisions we make here don't affect the order of shard movement. You'd have to do something with the Reconciler if you wanted to affect shard movement order.

balancer.balance();

// Node weights are calculated after each internal balancing round and saved to the RoutingNodes copy.
@@ -188,7 +203,8 @@ public ShardAllocationDecision decideShardAllocation(final ShardRouting shard, f
writeLoadForecaster,
allocation,
balancerSettings.getThreshold(),
balancingWeightsFactory.create()
balancingWeightsFactory.create(),
nonPreferredShardIteratorFactory
);
AllocateUnassignedDecision allocateUnassignedDecision = AllocateUnassignedDecision.NOT_TAKEN;
MoveDecision moveDecision = MoveDecision.NOT_TAKEN;
@@ -248,12 +264,14 @@ public static class Balancer {
private final Map<String, ModelNode> nodes;
private final BalancingWeights balancingWeights;
private final NodeSorters nodeSorters;
private final NonPreferredShardIteratorFactory nonPreferredShardIteratorFactory;

private Balancer(
WriteLoadForecaster writeLoadForecaster,
RoutingAllocation allocation,
float threshold,
BalancingWeights balancingWeights
BalancingWeights balancingWeights,
NonPreferredShardIteratorFactory nonPreferredShardIteratorFactory
) {
this.writeLoadForecaster = writeLoadForecaster;
this.allocation = allocation;
@@ -266,6 +284,7 @@ private Balancer(
nodes = Collections.unmodifiableMap(buildModelFromAssigned());
this.nodeSorters = balancingWeights.createNodeSorters(nodesArray(), this);
this.balancingWeights = balancingWeights;
this.nonPreferredShardIteratorFactory = nonPreferredShardIteratorFactory;
}

private static long getShardDiskUsageInBytes(ShardRouting shardRouting, IndexMetadata indexMetadata, ClusterInfo clusterInfo) {
@@ -711,6 +730,94 @@ protected int comparePivot(int j) {
return indices;
}

/**
* Move started shards that are in non-preferred allocations
*/
public void moveNonPreferred() {
boolean movedAShard;
do {
// Any time we move a shard, we need to update the cluster info and ask again for the non-preferred shards
// as they may have changed
movedAShard = false;
for (Iterator<ShardRouting> nonPreferredShards = nonPreferredShardIteratorFactory.createNonPreferredShardIterator(
allocation
); nonPreferredShards.hasNext();) {
if (tryMoveShardIfNonPreferred(nonPreferredShards.next())) {
movedAShard = true;
break;
}
}
// TODO: Update cluster info
} while (movedAShard);
Contributor:

Maybe return Iterable, then the whole thing would be:

for (var shard : nonPreferredIterable(allocation)) {
  if (tryMoveShardIfNonPreferred(shard)) {
    return;
  }
}

nicktindall (Contributor Author), Sep 11, 2025:

The code here is what the ultimate solution should look like, where we are able to do multiple moves in a single allocate call. We stop iterating when no moves are made, but each time we make a move we refresh the shard iterator, because we may have resolved a hot-spot, which could change the order or contents of the list.

For example, if there are two hot-spotted nodes (N and M), the first time we call for the iterator it will be:

N1, N2, N3, M1, M2, M3, ...

then if we successfully move N2 and it resolves the hot-spot, we'll ask again and get

M1, M2, M3, ...

Hence the nested loop, but true, the inner loop could be tidier with an Iterable.

Contributor Author:

Switched to Iterable in cebf3a9
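
A rough sketch of how the outer loop reads once the factory hands back an Iterable; the Iterable-returning method name createNonPreferredShardIterable is an assumption for illustration, not necessarily what cebf3a9 introduced:

    // Sketch under assumptions: the refresh-after-every-move behaviour of the original loop is kept.
    public void moveNonPreferred() {
        boolean movedAShard;
        do {
            movedAShard = false;
            // Re-ask for the iterable after every successful move: resolving one hot-spot can change
            // which shards we want to move next, and in what order.
            for (ShardRouting shard : nonPreferredShardIteratorFactory.createNonPreferredShardIterable(allocation)) {
                if (tryMoveShardIfNonPreferred(shard)) {
                    movedAShard = true;
                    break;
                }
            }
            // TODO: Update cluster info
        } while (movedAShard);
    }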

}

private boolean tryMoveShardIfNonPreferred(ShardRouting shardRouting) {
ProjectIndex index = projectIndex(shardRouting);
final MoveDecision moveDecision = decideMoveNonPreferred(index, shardRouting);
if (moveDecision.isDecisionTaken() && moveDecision.forceMove()) {
final ModelNode sourceNode = nodes.get(shardRouting.currentNodeId());
final ModelNode targetNode = nodes.get(moveDecision.getTargetNode().getId());
sourceNode.removeShard(index, shardRouting);
Tuple<ShardRouting, ShardRouting> relocatingShards = routingNodes.relocateShard(
shardRouting,
targetNode.getNodeId(),
allocation.clusterInfo().getShardSize(shardRouting, ShardRouting.UNAVAILABLE_EXPECTED_SHARD_SIZE),
"non-preferred",
allocation.changes()
);
final ShardRouting shard = relocatingShards.v2();
targetNode.addShard(projectIndex(shard), shard);
if (logger.isTraceEnabled()) {
logger.trace("Moved shard [{}] to node [{}]", shardRouting, targetNode.getRoutingNode());
}
return true;
} else if (moveDecision.isDecisionTaken() && moveDecision.canRemain() == false) {
logger.trace("[{}][{}] can't move", shardRouting.index(), shardRouting.id());
}
return false;
}

/**
* Makes a decision on whether to move a started shard to another node. The following rules apply
* to the {@link MoveDecision} return object:
* 1. If the shard is not started, no decision will be taken and {@link MoveDecision#isDecisionTaken()} will return false.
* 2. If the shard's current allocation is preferred ({@link Decision.Type#YES}), no attempt will be made to move the shard and
* {@link MoveDecision#getCanRemainDecision} will have a decision type of YES. All other fields in the object will be null.
* 3. If the shard is not allowed ({@link Decision.Type#NO}), or not preferred ({@link Decision.Type#NOT_PREFERRED}) to remain
* on its current node, then {@link MoveDecision#getAllocationDecision()} will be populated with the decision of moving to
* another node. If {@link MoveDecision#forceMove()} returns {@code true}, then {@link MoveDecision#getTargetNode} will return
* a non-null value representing a node that returned {@link Decision.Type#YES} from canAllocate, otherwise the assignedNodeId
* will be null.
* 4. If the method is invoked in explain mode (e.g. from the cluster allocation explain APIs), then
* {@link MoveDecision#getNodeDecisions} will have a non-null value.
*/
public MoveDecision decideMoveNonPreferred(final ProjectIndex index, final ShardRouting shardRouting) {
NodeSorter sorter = nodeSorters.sorterForShard(shardRouting);
index.assertMatch(shardRouting);

if (shardRouting.started() == false) {
// we can only move started shards
return MoveDecision.NOT_TAKEN;
}

final ModelNode sourceNode = nodes.get(shardRouting.currentNodeId());
assert sourceNode != null && sourceNode.containsShard(index, shardRouting);
RoutingNode routingNode = sourceNode.getRoutingNode();
Decision canRemain = allocation.deciders().canRemain(shardRouting, routingNode, allocation);
if ((canRemain.type() == Type.NOT_PREFERRED || canRemain.type() == Type.NO) == false) {
return MoveDecision.remain(canRemain);
}
Contributor Author:

We will consider NO and NOT_PREFERRED here, because it may be that a NO is really a NOT_PREFERRED that's also a NO.

Member:

I don't quite follow here; I'd appreciate it if you could help me understand it better. Do you mean the decision could be an overall NO because some other decider may say NO while the writeLoad decider says NOT_PREFERRED? Since we run moveShards first, do we still need to consider NO here?

Contributor Author:

In Allocation#withDeciders we return the "most negative" decision, so when one decider says NOT_PREFERRED and another says NO, the combined result is NO. Because we're iterating in most-desirable-to-move-first order, if we see either of these values returned it makes sense to assume there was a NOT_PREFERRED in there and make the move anyway. The alternative would be to assume there was no NOT_PREFERRED when there is a NO, and potentially move a less-preferred shard.

This will come into play now: as @DiannaHohensee and I discussed this morning, it's probably better to run moveNonPreferred first, because otherwise we risk moving a sub-optimal shard when NO and NOT_PREFERRED intersect.

Contributor:

Can you elaborate on that argument about moving non-preferred first? I would naively think we want to ensure we move all hard-rules first - to vacate nodes - and then move the non-preferred after.

I think that also avoids this slightly confusing check.

Contributor:

I'll note that moveShards should try to move shards to places where canAllocate says YES over places where it says NOT_PREFERRED, which seems to solve the sub-optimal shard movement issue?

Contributor:

> Can you elaborate on that argument about moving non-preferred first?

The ShardMovementWriteLoadSimulator will simulate the end of a hot-spot as soon as a single shard leaves the node that is hot-spotting. So if moveShards runs first, it could eliminate the hot-spot before we reach moveNonPreferred and have the opportunity to select a sensible shard.

Contributor:

> I would naively think we want to ensure we move all hard-rules first - to vacate nodes - and then move the non-preferred after.

The Balanced/DesiredBalanceShardsAllocators do not pick the order of shard movement; the Reconciler does that. The allocator and reconciler happen to have the same order, but I think the priority for the allocator is to make the best choices, not to consider shard movement priority. The reconciler behaviour is actually in my balancer changes patch.

The exception is allocateUnassigned for primaries, for which there's an early exit from the allocators to publish the DesiredBalance ASAP.
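
A hedged illustration, with toy types rather than the real Decision/AllocationDeciders classes, of the "most negative decision" behaviour described earlier in this thread; it is why the canRemain check above treats NO and NOT_PREFERRED the same way:

    import java.util.List;

    // Toy stand-in for Decision.Type, for illustration only.
    enum DecisionType { NO, NOT_PREFERRED, YES }

    final class MostNegativeDecisionExample {
        // Assumption: the combined decision is the most negative of the individual decider results.
        static DecisionType combine(List<DecisionType> deciderResults) {
            if (deciderResults.contains(DecisionType.NO)) {
                return DecisionType.NO;
            }
            if (deciderResults.contains(DecisionType.NOT_PREFERRED)) {
                return DecisionType.NOT_PREFERRED;
            }
            return DecisionType.YES;
        }

        public static void main(String[] args) {
            // Write-load decider says NOT_PREFERRED, another decider says NO: the overall answer is NO,
            // so a shard that is really NOT_PREFERRED can surface as NO and must still be considered.
            System.out.println(combine(List.of(DecisionType.NOT_PREFERRED, DecisionType.NO))); // NO
        }
    }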


sorter.reset(index);
/*
* the sorter holds the minimum weight node first for the shards index.
* We now walk through the nodes until we find a node to allocate the shard.
* This is not guaranteed to be balanced after this operation we still try best effort to
* allocate on the minimal eligible node.
*/
return decideMove(sorter, shardRouting, sourceNode, canRemain, this::decideCanAllocatePreferredOnly);
}

/**
* Move started shards that can not be allocated to a node anymore
*
@@ -839,6 +946,15 @@ private MoveDecision decideMove(
);
}

private Decision decideCanAllocatePreferredOnly(ShardRouting shardRouting, RoutingNode target) {
Decision decision = allocation.deciders().canAllocate(shardRouting, target, allocation);
// not-preferred means no here
if (decision.type() == Type.NOT_PREFERRED) {
return Decision.NO;
}
return decision;
}

private Decision decideCanAllocate(ShardRouting shardRouting, RoutingNode target) {
// don't use canRebalance as we want hard filtering rules to apply. See #17698
return allocation.deciders().canAllocate(shardRouting, target, allocation);
server/src/main/java/org/elasticsearch/cluster/routing/allocation/allocator/DefaultNonPreferredShardIteratorFactory.java
@@ -0,0 +1,128 @@
/*
* Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
* or more contributor license agreements. Licensed under the "Elastic License
* 2.0", the "GNU Affero General Public License v3.0 only", and the "Server Side
* Public License v 1"; you may not use this file except in compliance with, at
* your election, the "Elastic License 2.0", the "GNU Affero General Public
* License v3.0 only", or the "Server Side Public License, v 1".
*/

package org.elasticsearch.cluster.routing.allocation.allocator;

import org.elasticsearch.cluster.routing.RoutingNode;
import org.elasticsearch.cluster.routing.ShardRouting;
import org.elasticsearch.cluster.routing.allocation.RoutingAllocation;
import org.elasticsearch.cluster.routing.allocation.WriteLoadConstraintSettings;
import org.elasticsearch.threadpool.ThreadPool;

import java.util.ArrayList;
import java.util.Collections;
import java.util.Comparator;
import java.util.Iterator;
import java.util.List;
import java.util.Set;
import java.util.TreeSet;

/**
* Non-preferred shard iterator factory that returns the most desirable shards from most-hot-spotted
* nodes first.
* Does not return nodes for which we have no write-pool utilization, or shards for which we have no
* write-load data.
*/
public class DefaultNonPreferredShardIteratorFactory implements NonPreferredShardIteratorFactory {

private final WriteLoadConstraintSettings writeLoadConstraintSettings;

public DefaultNonPreferredShardIteratorFactory(WriteLoadConstraintSettings writeLoadConstraintSettings) {
this.writeLoadConstraintSettings = writeLoadConstraintSettings;
}

@Override
public Iterator<ShardRouting> createNonPreferredShardIterator(RoutingAllocation allocation) {
if (writeLoadConstraintSettings.getWriteLoadConstraintEnabled().notFullyEnabled()) {
return Collections.emptyIterator();
}
final Set<NodeShardIterable> hotSpottedNodes = new TreeSet<>(Comparator.reverseOrder());
final var nodeUsageStatsForThreadPools = allocation.clusterInfo().getNodeUsageStatsForThreadPools();
for (RoutingNode node : allocation.routingNodes()) {
var nodeUsageStats = nodeUsageStatsForThreadPools.get(node.nodeId());
if (nodeUsageStats != null) {
final var writeThreadPoolStats = nodeUsageStats.threadPoolUsageStatsMap().get(ThreadPool.Names.WRITE);
assert writeThreadPoolStats != null;
hotSpottedNodes.add(new NodeShardIterable(allocation, node, writeThreadPoolStats.maxThreadPoolQueueLatencyMillis()));
}
}
return new NodeShardIterator(hotSpottedNodes.iterator());
Contributor:

hotSpottedNodes seems to be all nodes with stats available, not necessarily hot spots - missing a maxQueueLatency threshold check?

nicktindall (Contributor Author), Sep 11, 2025:

No, this is intentional, and perhaps more of a naming issue. Ideally this iterator factory just produces the iterator and doesn't do any filtering at all (that's the job of the deciders); it just returns shards in an order where the most desirable to move are presented first.

Contributor Author:

I realise I am doing some filtering by excluding nodes with no utilisation and shards with no write load, but that goes to my earlier comment about it being over-fitted to the write-load use case. We can change that if things change, but currently there's no sense in investigating those shards.
}

private static class NodeShardIterator implements Iterator<ShardRouting> {

private final Iterator<NodeShardIterable> iterator;
private Iterator<ShardRouting> currentShardIterator;

private NodeShardIterator(Iterator<NodeShardIterable> iterator) {
this.iterator = iterator;
}

@Override
public boolean hasNext() {
if (currentShardIterator == null || currentShardIterator.hasNext() == false) {
if (iterator.hasNext()) {
currentShardIterator = iterator.next().iterator();
} else {
return false;
}
}
return currentShardIterator.hasNext();
}

@Override
public ShardRouting next() {
if (currentShardIterator == null) {
currentShardIterator = iterator.next().iterator();
}
return currentShardIterator.next();
}
}

private static class NodeShardIterable implements Iterable<ShardRouting>, Comparable<NodeShardIterable> {

private final RoutingAllocation allocation;
private final RoutingNode routingNode;
private final long maxQueueLatencyMillis;

private NodeShardIterable(RoutingAllocation allocation, RoutingNode routingNode, long maxQueueLatencyMillis) {
this.allocation = allocation;
this.routingNode = routingNode;
this.maxQueueLatencyMillis = maxQueueLatencyMillis;
}

@Override
public Iterator<ShardRouting> iterator() {
return createShardIterator();
}

@Override
public int compareTo(NodeShardIterable o) {
return Long.compare(maxQueueLatencyMillis, o.maxQueueLatencyMillis);
}

private Iterator<ShardRouting> createShardIterator() {
final var shardWriteLoads = allocation.clusterInfo().getShardWriteLoads();
Contributor:

It looks like you're creating a list of shards across all nodes. I wonder if instead we could first collect a list of nodes that are hot-spotting, then create separate lists of shards (with their write loads, skipping any shards with 0 load) for each hot-spotting node from allocation.clusterInfo().getShardWriteLoads(), and finally sort and iterate each shard list in the order we prefer, checking whether we can move each shard until we find one that's movable for each node. We'd still need an iterator to sort and manage a list of shards, but it might be simpler just iterating at that level? Then the nodes don't need iterators.

We only want to move one shard per node. It's not obvious to me how to easily achieve that when iterating all shards at once.

nicktindall (Contributor Author), Sep 15, 2025:

The approach here is that the iterator returns the shards we'd like to move next, in order of preference. Once we move a shard we ask again for this list. We have to do this because every time we move a shard it can change the list of shards we want to move (e.g. if a shard movement resolves a hot-spot, the shards from that node might appear further down the list in the subsequent iterator, and a lesser-hot-spotted node might appear at the front of it instead).

I tried not to do any filtering here, because this is supposed to be the prioritisation logic; the deciders themselves decide whether we canRemain (it would seem to be duplicating logic to do it here as well).

If we go through one of these iterators and don't find any shard we want to move, we break out of the loop and continue to balancing.

Contributor:

> We have to do this because every time we move a shard it can change the list of shards we want to move

But moving a single shard resolves the hot spot. Even if we move one shard off of a NodeA, the priority order for further shards to move away from NodeA shouldn't be dynamic 🤔

> if a shard movement resolves a hot-spot, the shards from that node might appear further down the list in the subsequent iterator, and a lesser-hot-spotted node might appear at the front of it instead

IIUC, you're trying to fairly spread node hot-spot resolution? Like pick a shard for NodeA, then pick a shard for NodeB, before coming back to NodeA. I don't think that matters for the allocator, which comes up with the final allocation, not the plan for which shards to move first. NodeA is hot-spotting, and we can focus on NodeA's shards to resolve the hot spot before moving on to NodeB's shards. We wouldn't be assigning any of NodeA or NodeB's shards to NodeA or NodeB because they are hot / not-preferred, so there's no interaction there, and no need for evenness / fairness in selection order.

Contributor Author:

> IIUC, you're trying to fairly spread node hot-spot resolution?

No. As discussed on Zoom, the iterator represents our preference for the next move. For example, if there are three nodes (M, N, O) with queue latencies (100, 50, 0), the shards will be iterated in the order

M1, M2, M3, M4, N1, N2, N3, O1, O2

where Mx denotes the shard on node M that is the xth most desirable to move.

So we'll iterate through that list finding the first of those shards that can move somewhere, then execute the move, then we'll ask for that list again in the next iteration.

Say we moved a shard from M to O and now our latencies for (M, N, O) are (0, 50, 0); the next iterator will look like

N1, N2, N3, M1, M2, O1, O2, O3

because N is the most likely to be hot-spotted, so it goes to the front of the list.

Then we move a shard off of N and the latencies change to (M, N, O) = (0, 0, 0).

The next iterator would then look something like (although M, N, O could be in any order because they're all equal):

N1, N2, M1, M2, O1, O2, O3, O4

We'd iterate through that, find no shard with canRemain = NOT_PREFERRED, make no movements, and move on to the next phase.

Contributor:

This ties in with my prior comment about actually calling canRemain first.
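
A hypothetical test-style sketch of the ordering contract described above; the allocationWithWriteQueueLatencies helper, the writeLoadConstraintSettings fixture, and the latency values are assumptions for illustration, not code from this PR:

    // Assumed helper: builds a RoutingAllocation whose ClusterInfo reports the given per-node
    // write-pool max queue latencies and a non-zero write load for every shard on nodes M, N and O.
    public void testHottestNodeShardsAreReturnedFirst() {
        RoutingAllocation allocation = allocationWithWriteQueueLatencies(Map.of("M", 100L, "N", 50L, "O", 0L));
        DefaultNonPreferredShardIteratorFactory factory = new DefaultNonPreferredShardIteratorFactory(writeLoadConstraintSettings);

        List<String> nodeIds = new ArrayList<>();
        factory.createNonPreferredShardIterator(allocation).forEachRemaining(shard -> nodeIds.add(shard.currentNodeId()));

        // All of M's shards (100ms queue latency) should precede N's (50ms), and N's should precede O's (0ms)
        assertThat(nodeIds.lastIndexOf("M"), lessThan(nodeIds.indexOf("N")));
        assertThat(nodeIds.lastIndexOf("N"), lessThan(nodeIds.indexOf("O")));
    }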

final List<ShardRouting> sortedRoutings = new ArrayList<>();
double totalWriteLoad = 0;
for (ShardRouting shard : routingNode) {
Double shardWriteLoad = shardWriteLoads.get(shard.shardId());
if (shardWriteLoad != null) {
sortedRoutings.add(shard);
totalWriteLoad += shardWriteLoad;
}
}
// TODO: Work out what this order should be
// Sort by distance-from-mean-write-load
double meanWriteLoad = totalWriteLoad / sortedRoutings.size();
sortedRoutings.sort(Comparator.comparing(sr -> Math.abs(shardWriteLoads.get(sr.shardId()) - meanWriteLoad)));
return sortedRoutings.iterator();
}
Contributor:

If I recall correctly, Henning mentioned picking a shard somewhere in the middle. I think we don't need a sort (strong order) but a set of average shards. For example, create two partitions - preferable and not. Everything that is 0.5-0.8 of maxShardLoad goes to preferable.

Contributor:

Yeah, we'd like to do that, and I find that slightly harder with returning a list (though possibly doable).

mhl-b (Contributor), Sep 16, 2025:

I was thinking about this: an iterator that returns average shards first, and if nothing worked then heavy shards (>0.8), then light (<0.5). In the worst case we traverse the shards 4 times: find max load, then any average, then any heavy, then light.

    private Stream<ShardRouting> shardsStream() {
        return StreamSupport.stream(routingNode.spliterator(), false);
    }
...
        var maxLoad = shardsStream().mapToDouble(ShardRouting::load).max().orElse(1.0);
        var avg = shardsStream().filter(s -> s.load() / maxLoad >= 0.5 && s.load() / maxLoad <= 0.8);
        var heavy = shardsStream().filter(s -> s.load() / maxLoad > 0.8);
        var light = shardsStream().filter(s -> s.load() / maxLoad < 0.5);
        return concat(concat(avg, heavy), light).iterator(); // Stream.concat()

PS: there is no ShardRouting::load, used here for brevity.

Contributor Author:

Attempted in de052a3

Contributor:

:'-) ohboi. I would still opt for a lazy sequence rather than a sorted list. I believe the expected case is to have some average shards to move, so allocating and sorting a list seems redundant; a single pass with a filter should suffice. Especially in the context of a 10k-shard node, there is a high probability of having a good average shard.

Contributor Author:

Yes, good point, made lazier in d59ea2b

Also, I didn't bother sorting inside the low/medium/high ranges, but I can easily add that if we think it's worth it for the determinism.

}
}
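
A self-contained sketch of the lazy three-band ordering discussed in the review thread above; the Map of per-shard write loads stands in for ClusterInfo#getShardWriteLoads, and the 0.5/0.8 cut-offs are the ones suggested in review. This is an assumption-based illustration, not the code of d59ea2b:

    import java.util.Iterator;
    import java.util.Map;
    import java.util.stream.Stream;
    import java.util.stream.StreamSupport;

    import org.elasticsearch.cluster.routing.RoutingNode;
    import org.elasticsearch.cluster.routing.ShardRouting;
    import org.elasticsearch.index.shard.ShardId;

    class BandedShardOrderingSketch {

        /** Shards on the node that have a known write load, as a lazily-evaluated stream. */
        private static Stream<ShardRouting> shardsWithLoad(RoutingNode node, Map<ShardId, Double> writeLoads) {
            return StreamSupport.stream(node.spliterator(), false).filter(s -> writeLoads.containsKey(s.shardId()));
        }

        /** Medium-load shards first, then heavy, then light; each band is only computed as it is consumed. */
        static Iterator<ShardRouting> bandedIterator(RoutingNode node, Map<ShardId, Double> writeLoads) {
            double maxLoad = shardsWithLoad(node, writeLoads).mapToDouble(s -> writeLoads.get(s.shardId())).max().orElse(1.0);
            Stream<ShardRouting> medium = shardsWithLoad(node, writeLoads)
                .filter(s -> { double r = writeLoads.get(s.shardId()) / maxLoad; return r >= 0.5 && r <= 0.8; });
            Stream<ShardRouting> heavy = shardsWithLoad(node, writeLoads).filter(s -> writeLoads.get(s.shardId()) / maxLoad > 0.8);
            Stream<ShardRouting> light = shardsWithLoad(node, writeLoads).filter(s -> writeLoads.get(s.shardId()) / maxLoad < 0.5);
            return Stream.concat(Stream.concat(medium, heavy), light).iterator();
        }
    }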