phananh1010
diff --git a/server/src/internalClusterTest/java/org/elasticsearch/cluster/allocation/WriteLoadConstraintMonitorIT.java b/server/src/internalClusterTest/java/org/elasticsearch/cluster/allocation/WriteLoadConstraintMonitorIT.java
Lines changed: 6 additions & 4 deletions
diff --git a/server/src/main/java/org/elasticsearch/cluster/routing/allocation/WriteLoadConstraintMonitor.java b/server/src/main/java/org/elasticsearch/cluster/routing/allocation/WriteLoadConstraintMonitor.java
Lines changed: 24 additions & 8 deletions
@@ -51,6 +51,8 @@ public void testRerouteIsCalledWhenHotSpotAppears() {
         internalCluster().startMasterOnlyNode(settings);
         final String dataNodeOne = internalCluster().startDataOnlyNode(settings);
         final String dataNodeTwo = internalCluster().startDataOnlyNode(settings);
+        // Maintain a third node so that there's always at least one non-hot-spotting node that can receive shards.
+        internalCluster().startDataOnlyNode(settings);
 
         // Unmodified cluster info should detect no hot-spotting nodes
         MockLog.awaitLogger(
@@ -60,7 +62,7 @@ public void testRerouteIsCalledWhenHotSpotAppears() {
                 "no hot-spots detected",
                 WriteLoadConstraintMonitor.class.getCanonicalName(),
                 Level.TRACE,
-                "No hot-spotting nodes detected"
+                "No hot-spotting write nodes detected"
             )
         );
 
@@ -76,7 +78,7 @@ public void testRerouteIsCalledWhenHotSpotAppears() {
                 WriteLoadConstraintMonitor.class.getCanonicalName(),
                 Level.DEBUG,
                 Strings.format("""
-                    Nodes [[%s]] are hot-spotting, of 3 total cluster nodes. Reroute for hot-spotting has never previously been called. \
+                    Nodes [[%s]] are hot-spotting, of 4 total cluster nodes. Reroute for hot-spotting has never previously been called. \
                     Previously hot-spotting nodes are [0 nodes]. The write thread pool queue latency threshold is [%s]. \
                     Triggering reroute.
                     """, getNodeId(dataNodeOne), TimeValue.timeValueMillis(queueLatencyThresholdMillis))
@@ -111,7 +113,7 @@ public void testRerouteIsCalledWhenHotSpotAppears() {
                 WriteLoadConstraintMonitor.class.getCanonicalName(),
                 Level.DEBUG,
                 Strings.format("""
-                    Nodes [[*]] are hot-spotting, of 3 total cluster nodes. \
+                    Nodes [[*]] are hot-spotting, of 4 total cluster nodes. \
                     Reroute for hot-spotting was last called [*] ago. Previously hot-spotting nodes are [[%s]]. \
                     The write thread pool queue latency threshold is [%s]. Triggering reroute.
                     """, getNodeId(dataNodeOne), TimeValue.timeValueMillis(queueLatencyThresholdMillis))
@@ -130,7 +132,7 @@ public void testRerouteIsCalledWhenHotSpotAppears() {
                 "no hot-spots detected",
                 WriteLoadConstraintMonitor.class.getCanonicalName(),
                 Level.TRACE,
-                "No hot-spotting nodes detected"
+                "No hot-spotting write nodes detected"
             )
         );
     }
 
@@ -16,6 +16,7 @@
 import org.elasticsearch.cluster.ClusterInfoService;
 import org.elasticsearch.cluster.ClusterState;
 import org.elasticsearch.cluster.NodeUsageStatsForThreadPools;
+import org.elasticsearch.cluster.node.DiscoveryNodeRole;
 import org.elasticsearch.cluster.routing.RerouteService;
 import org.elasticsearch.common.Priority;
 import org.elasticsearch.common.Strings;
@@ -26,6 +27,7 @@
 import org.elasticsearch.threadpool.ThreadPool;
 
 import java.util.Set;
+import java.util.concurrent.atomic.AtomicBoolean;
 import java.util.function.LongSupplier;
 import java.util.function.Supplier;
 
@@ -75,18 +77,32 @@ public void onNewInfo(ClusterInfo clusterInfo) {
         logger.trace("processing new cluster info");
 
         final int numberOfNodes = clusterInfo.getNodeUsageStatsForThreadPools().size();
-        final Set<String> nodeIdsExceedingLatencyThreshold = Sets.newHashSetWithExpectedSize(numberOfNodes);
+        final Set<String> writeNodeIdsExceedingQueueLatencyThreshold = Sets.newHashSetWithExpectedSize(numberOfNodes);
+        AtomicBoolean haveWriteNodesBelowQueueLatencyThreshold = new AtomicBoolean(false);
         clusterInfo.getNodeUsageStatsForThreadPools().forEach((nodeId, usageStats) -> {
+            if (state.getNodes().get(nodeId).getRoles().contains(DiscoveryNodeRole.SEARCH_ROLE)) {
+                // Search nodes are not expected to have write load hot-spots and are not considered for shard relocation.
+                // TODO (ES-13314): consider stateful data tiers
+                return;
+            }
             final NodeUsageStatsForThreadPools.ThreadPoolUsageStats writeThreadPoolStats = usageStats.threadPoolUsageStatsMap()
                 .get(ThreadPool.Names.WRITE);
             assert writeThreadPoolStats != null : "Write thread pool is not publishing usage stats for node [" + nodeId + "]";
-            if (writeThreadPoolStats.maxThreadPoolQueueLatencyMillis() > writeLoadConstraintSettings.getQueueLatencyThreshold().millis()) {
-                nodeIdsExceedingLatencyThreshold.add(nodeId);
+            if (writeThreadPoolStats.maxThreadPoolQueueLatencyMillis() >= writeLoadConstraintSettings.getQueueLatencyThreshold().millis()) {
+                writeNodeIdsExceedingQueueLatencyThreshold.add(nodeId);
+            } else {
+                haveWriteNodesBelowQueueLatencyThreshold.set(true);
             }
         });
 
-        if (nodeIdsExceedingLatencyThreshold.isEmpty()) {
-            logger.trace("No hot-spotting nodes detected");
+        if (writeNodeIdsExceedingQueueLatencyThreshold.isEmpty()) {
+            logger.trace("No hot-spotting write nodes detected");
+            return;
+        }
+        if (haveWriteNodesBelowQueueLatencyThreshold.get() == false) {
+            logger.debug("""
+                Nodes [{}] are above the queue latency threshold, but there are no write nodes below the threshold. \
+                Cannot rebalance shards.""", nodeSummary(writeNodeIdsExceedingQueueLatencyThreshold));
             return;
         }
 
@@ -98,14 +114,14 @@ public void onNewInfo(ClusterInfo clusterInfo) {
         // We know that there is at least one hot-spotting node if we've reached this code. Now check whether there are any new hot-spots
         // or hot-spots that are persisting and need further balancing work.
         if (haveCalledRerouteRecently == false
-            || Sets.difference(nodeIdsExceedingLatencyThreshold, lastSetOfHotSpottedNodes).isEmpty() == false) {
+            || Sets.difference(writeNodeIdsExceedingQueueLatencyThreshold, lastSetOfHotSpottedNodes).isEmpty() == false) {
             if (logger.isDebugEnabled()) {
                 logger.debug(
                     """
                         Nodes [{}] are hot-spotting, of {} total cluster nodes. Reroute for hot-spotting {}. \
                         Previously hot-spotting nodes are [{}]. The write thread pool queue latency threshold is [{}]. Triggering reroute.
                         """,
-                    nodeSummary(nodeIdsExceedingLatencyThreshold),
+                    nodeSummary(writeNodeIdsExceedingQueueLatencyThreshold),
                     state.nodes().size(),
                     lastRerouteTimeMillis == 0
                         ? "has never previously been called"
@@ -124,7 +140,7 @@ public void onNewInfo(ClusterInfo clusterInfo) {
                 )
             );
             lastRerouteTimeMillis = currentTimeMillisSupplier.getAsLong();
-            lastSetOfHotSpottedNodes = nodeIdsExceedingLatencyThreshold;
+            lastSetOfHotSpottedNodes = writeNodeIdsExceedingQueueLatencyThreshold;
         } else {
             logger.debug(
                 "Not calling reroute because we called reroute [{}] ago and there are no new hot spots",