Commit 31e3c55

Implement WriteLoadConstraintMonitor (#132917)
1 parent 4a745b5 commit 31e3c55

File tree

5 files changed: +481 -67 lines

server/src/main/java/org/elasticsearch/cluster/NodeUsageStatsForThreadPools.java

Lines changed: 1 addition & 29 deletions
@@ -89,33 +89,5 @@ public void writeTo(StreamOutput out) throws IOException {
             out.writeFloat(this.averageThreadPoolUtilization);
             out.writeVLong(this.maxThreadPoolQueueLatencyMillis);
         }
-
-        @Override
-        public int hashCode() {
-            return Objects.hash(totalThreadPoolThreads, averageThreadPoolUtilization, maxThreadPoolQueueLatencyMillis);
-        }
-
-        @Override
-        public String toString() {
-            return "[totalThreadPoolThreads="
-                + totalThreadPoolThreads
-                + ", averageThreadPoolUtilization="
-                + averageThreadPoolUtilization
-                + ", maxThreadPoolQueueLatencyMillis="
-                + maxThreadPoolQueueLatencyMillis
-                + "]";
-        }
-
-        @Override
-        public boolean equals(Object o) {
-            if (this == o) return true;
-            if (o == null || getClass() != o.getClass()) return false;
-            ThreadPoolUsageStats other = (ThreadPoolUsageStats) o;
-            return totalThreadPoolThreads == other.totalThreadPoolThreads
-                && averageThreadPoolUtilization == other.averageThreadPoolUtilization
-                && maxThreadPoolQueueLatencyMillis == other.maxThreadPoolQueueLatencyMillis;
-        }
-
-    } // ThreadPoolUsageStats
-
+    }
 }
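
Note: the deleted hashCode/toString/equals overrides look like redundant boilerplate. Assuming ThreadPoolUsageStats is a Java record (its components are read through accessor calls such as maxThreadPoolQueueLatencyMillis() elsewhere in this commit), the compiler already generates effectively equivalent members from the record components. A minimal standalone sketch with a hypothetical stand-in record, not code from the commit:

// Hypothetical stand-in for the nested ThreadPoolUsageStats record.
public class RecordDefaultsDemo {
    record Stats(int totalThreadPoolThreads, float averageThreadPoolUtilization, long maxThreadPoolQueueLatencyMillis) {}

    public static void main(String[] args) {
        Stats a = new Stats(8, 0.75f, 1200L);
        Stats b = new Stats(8, 0.75f, 1200L);
        // Compiler-generated members compare and print all record components.
        System.out.println(a.equals(b));                   // true
        System.out.println(a.hashCode() == b.hashCode());  // true
        System.out.println(a);                             // Stats[totalThreadPoolThreads=8, ...]
    }
}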

server/src/main/java/org/elasticsearch/cluster/routing/allocation/WriteLoadConstraintMonitor.java

Lines changed: 57 additions & 16 deletions
@@ -15,26 +15,32 @@
 import org.elasticsearch.cluster.ClusterInfo;
 import org.elasticsearch.cluster.ClusterInfoService;
 import org.elasticsearch.cluster.ClusterState;
+import org.elasticsearch.cluster.NodeUsageStatsForThreadPools;
 import org.elasticsearch.cluster.routing.RerouteService;
 import org.elasticsearch.common.Priority;
+import org.elasticsearch.common.Strings;
 import org.elasticsearch.common.settings.ClusterSettings;
+import org.elasticsearch.common.util.set.Sets;
 import org.elasticsearch.gateway.GatewayService;
+import org.elasticsearch.threadpool.ThreadPool;
 
+import java.util.Set;
 import java.util.function.LongSupplier;
 import java.util.function.Supplier;
 
 /**
- * Monitors the node-level write thread pool usage across the cluster and initiates (coming soon) a rebalancing round (via
+ * Monitors the node-level write thread pool usage across the cluster and initiates a rebalancing round (via
  * {@link RerouteService#reroute}) whenever a node crosses the node-level write load thresholds.
- *
- * TODO (ES-11992): implement
  */
 public class WriteLoadConstraintMonitor {
     private static final Logger logger = LogManager.getLogger(WriteLoadConstraintMonitor.class);
+    private static final int MAX_NODE_IDS_IN_MESSAGE = 3;
     private final WriteLoadConstraintSettings writeLoadConstraintSettings;
     private final Supplier<ClusterState> clusterStateSupplier;
     private final LongSupplier currentTimeMillisSupplier;
     private final RerouteService rerouteService;
+    private volatile long lastRerouteTimeMillis = 0;
+    private volatile Set<String> lastSetOfHotSpottedNodes = Set.of();
 
     public WriteLoadConstraintMonitor(
         ClusterSettings clusterSettings,
@@ -60,29 +66,64 @@ public void onNewInfo(ClusterInfo clusterInfo) {
             return;
         }
 
-        if (writeLoadConstraintSettings.getWriteLoadConstraintEnabled() == WriteLoadConstraintSettings.WriteLoadDeciderStatus.DISABLED) {
-            logger.trace("skipping monitor because the write load decider is disabled");
+        if (writeLoadConstraintSettings.getWriteLoadConstraintEnabled().notFullyEnabled()) {
+            logger.debug("skipping monitor because the write load decider is not fully enabled");
             return;
         }
 
        logger.trace("processing new cluster info");
 
-        boolean reroute = false;
-        String explanation = "";
-        final long currentTimeMillis = currentTimeMillisSupplier.getAsLong();
+        final int numberOfNodes = clusterInfo.getNodeUsageStatsForThreadPools().size();
+        final Set<String> nodeIdsExceedingLatencyThreshold = Sets.newHashSetWithExpectedSize(numberOfNodes);
+        clusterInfo.getNodeUsageStatsForThreadPools().forEach((nodeId, usageStats) -> {
+            final NodeUsageStatsForThreadPools.ThreadPoolUsageStats writeThreadPoolStats = usageStats.threadPoolUsageStatsMap()
+                .get(ThreadPool.Names.WRITE);
+            assert writeThreadPoolStats != null : "Write thread pool is not publishing usage stats for node [" + nodeId + "]";
+            if (writeThreadPoolStats.maxThreadPoolQueueLatencyMillis() > writeLoadConstraintSettings.getQueueLatencyThreshold().millis()) {
+                nodeIdsExceedingLatencyThreshold.add(nodeId);
+            }
+        });
 
-        // TODO (ES-11992): implement
+        if (nodeIdsExceedingLatencyThreshold.isEmpty()) {
+            logger.debug("No hot-spotting nodes detected");
+            return;
+        }
 
-        if (reroute) {
-            logger.debug("rerouting shards: [{}]", explanation);
-            rerouteService.reroute("disk threshold monitor", Priority.NORMAL, ActionListener.wrap(ignored -> {
-                final var reroutedClusterState = clusterStateSupplier.get();
+        final long currentTimeMillis = currentTimeMillisSupplier.getAsLong();
+        final long timeSinceLastRerouteMillis = currentTimeMillis - lastRerouteTimeMillis;
+        final boolean haveCalledRerouteRecently = timeSinceLastRerouteMillis < writeLoadConstraintSettings.getMinimumRerouteInterval()
+            .millis();
 
-                // TODO (ES-11992): implement
+        if (haveCalledRerouteRecently == false
+            || Sets.difference(nodeIdsExceedingLatencyThreshold, lastSetOfHotSpottedNodes).isEmpty() == false) {
+            if (logger.isDebugEnabled()) {
+                logger.debug(
+                    "Found {} exceeding the write thread pool queue latency threshold ({} total), triggering reroute",
+                    nodeSummary(nodeIdsExceedingLatencyThreshold),
+                    state.nodes().size()
+                );
+            }
+            final String reason = "hot-spotting detected by write load constraint monitor";
+            rerouteService.reroute(
+                reason,
+                Priority.NORMAL,
+                ActionListener.wrap(
                    ignored -> logger.trace("{} reroute successful", reason),
+                    e -> logger.debug(() -> Strings.format("reroute failed, reason: %s", reason), e)
+                )
+            );
+            lastRerouteTimeMillis = currentTimeMillisSupplier.getAsLong();
+            lastSetOfHotSpottedNodes = nodeIdsExceedingLatencyThreshold;
+        } else {
+            logger.debug("Not calling reroute because we called reroute recently and there are no new hot spots");
+        }
+    }
 
-            }, e -> logger.debug("reroute failed", e)));
+    private static String nodeSummary(Set<String> nodeIds) {
+        if (nodeIds.isEmpty() == false && nodeIds.size() <= MAX_NODE_IDS_IN_MESSAGE) {
+            return "[" + String.join(", ", nodeIds) + "]";
         } else {
-            logger.trace("no reroute required");
+            return nodeIds.size() + " nodes";
         }
     }
 }
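
The new onNewInfo logic boils down to two checks: collect the IDs of nodes whose write thread pool queue latency exceeds the configured threshold, then trigger a reroute only if no reroute happened within the minimum interval or if a node is hot-spotting that was not hot-spotting last time. A minimal standalone sketch of that throttling rule, using plain java.util collections and hypothetical names rather than the Elasticsearch types:

import java.util.HashSet;
import java.util.Set;

// Standalone sketch (hypothetical names) of the reroute throttling rule above.
final class RerouteThrottleSketch {
    private long lastRerouteTimeMillis = 0;
    private Set<String> lastHotSpottedNodes = Set.of();

    boolean shouldReroute(Set<String> hotSpottedNodes, long nowMillis, long minimumIntervalMillis) {
        if (hotSpottedNodes.isEmpty()) {
            return false; // no node exceeded the queue latency threshold
        }
        boolean calledRecently = (nowMillis - lastRerouteTimeMillis) < minimumIntervalMillis;
        Set<String> newHotSpots = new HashSet<>(hotSpottedNodes);
        newHotSpots.removeAll(lastHotSpottedNodes); // like Sets.difference(current, last)
        boolean reroute = calledRecently == false || newHotSpots.isEmpty() == false;
        if (reroute) {
            lastRerouteTimeMillis = nowMillis;
            lastHotSpottedNodes = hotSpottedNodes;
        }
        return reroute;
    }

    public static void main(String[] args) {
        RerouteThrottleSketch sketch = new RerouteThrottleSketch();
        System.out.println(sketch.shouldReroute(Set.of("node-1"), 1_000, 60_000));            // true: first hot spot
        System.out.println(sketch.shouldReroute(Set.of("node-1"), 2_000, 60_000));            // false: recent reroute, no new hot spot
        System.out.println(sketch.shouldReroute(Set.of("node-1", "node-2"), 3_000, 60_000));  // true: node-2 is a new hot spot
    }
}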

server/src/main/java/org/elasticsearch/cluster/routing/allocation/WriteLoadConstraintSettings.java

Lines changed: 20 additions & 21 deletions
@@ -107,41 +107,40 @@ public boolean disabled() {
         Setting.Property.NodeScope
     );
 
-    WriteLoadDeciderStatus writeLoadDeciderStatus;
-    TimeValue writeLoadDeciderRerouteIntervalSetting;
-    double writeThreadPoolHighUtilizationThresholdSetting;
+    private volatile WriteLoadDeciderStatus writeLoadDeciderStatus;
+    private volatile TimeValue minimumRerouteInterval;
+    private volatile double highUtilizationThreshold;
+    private volatile TimeValue queueLatencyThreshold;
 
     public WriteLoadConstraintSettings(ClusterSettings clusterSettings) {
-        clusterSettings.initializeAndWatch(WRITE_LOAD_DECIDER_ENABLED_SETTING, this::setWriteLoadConstraintEnabled);
-        clusterSettings.initializeAndWatch(WRITE_LOAD_DECIDER_REROUTE_INTERVAL_SETTING, this::setWriteLoadDeciderRerouteIntervalSetting);
+        clusterSettings.initializeAndWatch(WRITE_LOAD_DECIDER_ENABLED_SETTING, status -> this.writeLoadDeciderStatus = status);
+        clusterSettings.initializeAndWatch(
+            WRITE_LOAD_DECIDER_REROUTE_INTERVAL_SETTING,
+            timeValue -> this.minimumRerouteInterval = timeValue
+        );
         clusterSettings.initializeAndWatch(
             WRITE_LOAD_DECIDER_HIGH_UTILIZATION_THRESHOLD_SETTING,
-            this::setWriteThreadPoolHighUtilizationThresholdSetting
+            value -> highUtilizationThreshold = value.getAsRatio()
        );
-
-    };
-
-    private void setWriteLoadConstraintEnabled(WriteLoadDeciderStatus status) {
-        this.writeLoadDeciderStatus = status;
+        clusterSettings.initializeAndWatch(WRITE_LOAD_DECIDER_QUEUE_LATENCY_THRESHOLD_SETTING, value -> queueLatencyThreshold = value);
     }
 
     public WriteLoadDeciderStatus getWriteLoadConstraintEnabled() {
         return this.writeLoadDeciderStatus;
     }
 
-    public TimeValue getWriteLoadDeciderRerouteIntervalSetting() {
-        return this.writeLoadDeciderRerouteIntervalSetting;
+    public TimeValue getMinimumRerouteInterval() {
+        return this.minimumRerouteInterval;
     }
 
-    public double getWriteThreadPoolHighUtilizationThresholdSetting() {
-        return this.writeThreadPoolHighUtilizationThresholdSetting;
+    public TimeValue getQueueLatencyThreshold() {
+        return this.queueLatencyThreshold;
     }
 
-    private void setWriteLoadDeciderRerouteIntervalSetting(TimeValue timeValue) {
-        this.writeLoadDeciderRerouteIntervalSetting = timeValue;
-    }
-
-    private void setWriteThreadPoolHighUtilizationThresholdSetting(RatioValue percent) {
-        this.writeThreadPoolHighUtilizationThresholdSetting = percent.getAsRatio();
+    /**
+     * @return The threshold as a ratio - i.e. in [0, 1]
+     */
+    public double getHighUtilizationThreshold() {
+        return this.highUtilizationThreshold;
     }
 }
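
The settings class now caches each dynamic setting in a private volatile field that is written by the ClusterSettings watcher and read by the monitor and the decider from other threads, and the high-utilization threshold is exposed as a ratio in [0, 1]. A rough standalone sketch of that initialize-and-watch caching pattern, with hypothetical names standing in for the Elasticsearch types:

import java.util.function.Consumer;

// Rough sketch (hypothetical names) of the initialize-and-watch caching pattern used above.
final class DynamicSettingCacheSketch {
    private volatile long queueLatencyThresholdMillis;

    DynamicSettingCacheSketch(long initialMillis) {
        // Stand-in for ClusterSettings#initializeAndWatch(setting, consumer): apply the
        // initial value right away and re-apply the consumer on every later update.
        initializeAndWatch(initialMillis, value -> this.queueLatencyThresholdMillis = value);
    }

    private void initializeAndWatch(long initialValue, Consumer<Long> consumer) {
        consumer.accept(initialValue);
        // ...a real implementation would also register `consumer` for future setting updates...
    }

    long getQueueLatencyThreshold() {
        return queueLatencyThresholdMillis; // volatile read: callers on other threads see the latest value
    }

    public static void main(String[] args) {
        DynamicSettingCacheSketch settings = new DynamicSettingCacheSketch(5_000);
        System.out.println(settings.getQueueLatencyThreshold()); // 5000
    }
}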

server/src/main/java/org/elasticsearch/cluster/routing/allocation/decider/WriteLoadConstraintDecider.java

Lines changed: 1 addition & 1 deletion
@@ -61,7 +61,7 @@ public Decision canAllocate(ShardRouting shardRouting, RoutingNode node, Routing
         assert nodeUsageStatsForThreadPools.threadPoolUsageStatsMap().isEmpty() == false;
         assert nodeUsageStatsForThreadPools.threadPoolUsageStatsMap().get(ThreadPool.Names.WRITE) != null;
         var nodeWriteThreadPoolStats = nodeUsageStatsForThreadPools.threadPoolUsageStatsMap().get(ThreadPool.Names.WRITE);
-        var nodeWriteThreadPoolLoadThreshold = writeLoadConstraintSettings.getWriteThreadPoolHighUtilizationThresholdSetting();
+        var nodeWriteThreadPoolLoadThreshold = writeLoadConstraintSettings.getHighUtilizationThreshold();
         if (nodeWriteThreadPoolStats.averageThreadPoolUtilization() >= nodeWriteThreadPoolLoadThreshold) {
             // The node's write thread pool usage stats already show high utilization above the threshold for accepting new shards.
             String explain = Strings.format(
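
The decider change is just the switch to the renamed getter; since getHighUtilizationThreshold() returns the threshold as a ratio in [0, 1], the comparison against the node's reported average utilization stays a plain numeric check. A tiny illustration with made-up values:

// Illustrative only; the values are hypothetical.
public class UtilizationThresholdDemo {
    public static void main(String[] args) {
        double highUtilizationThreshold = 0.90;      // e.g. a "90%" setting exposed as a ratio
        float averageThreadPoolUtilization = 0.93f;  // utilization reported in the node's usage stats
        // Mirrors the decider's check: at or above the threshold, the node should not accept new write shards.
        System.out.println(averageThreadPoolUtilization >= highUtilizationThreshold); // true
    }
}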
