Update write load monitor log-levels (#136137)

DiannaHohensee · web-flow · commit b7237d32ab1d · 2025-10-08T12:42:55.000-04:00
Change the log levels in the write load constraint monitor to use DEBUG
when hot-spotting nodes are detected and TRACE when the monitor skips or
finds nothing to do. This supports enabling DEBUG logging in production
to show when the write load logic activates.

Closes ES-13139
diff --git a/server/src/main/java/org/elasticsearch/cluster/routing/allocation/WriteLoadConstraintMonitor.java b/server/src/main/java/org/elasticsearch/cluster/routing/allocation/WriteLoadConstraintMonitor.java
@@ -21,6 +21,7 @@
 import org.elasticsearch.common.Strings;
 import org.elasticsearch.common.settings.ClusterSettings;
 import org.elasticsearch.common.util.set.Sets;
+import org.elasticsearch.core.TimeValue;
 import org.elasticsearch.gateway.GatewayService;
 import org.elasticsearch.threadpool.ThreadPool;
 
@@ -62,12 +63,12 @@ public WriteLoadConstraintMonitor(
     public void onNewInfo(ClusterInfo clusterInfo) {
         final ClusterState state = clusterStateSupplier.get();
         if (state.blocks().hasGlobalBlock(GatewayService.STATE_NOT_RECOVERED_BLOCK)) {
-            logger.debug("skipping monitor as the cluster state is not recovered yet");
+            logger.trace("skipping monitor as the cluster state is not recovered yet");
             return;
         }
 
         if (writeLoadConstraintSettings.getWriteLoadConstraintEnabled().notFullyEnabled()) {
-            logger.debug("skipping monitor because the write load decider is not fully enabled");
+            logger.trace("skipping monitor because the write load decider is not fully enabled");
             return;
         }
 
@@ -85,7 +86,7 @@ public void onNewInfo(ClusterInfo clusterInfo) {
         });
 
         if (nodeIdsExceedingLatencyThreshold.isEmpty()) {
-            logger.debug("No hot-spotting nodes detected");
+            logger.trace("No hot-spotting nodes detected");
             return;
         }
 
@@ -94,12 +95,22 @@ public void onNewInfo(ClusterInfo clusterInfo) {
         final boolean haveCalledRerouteRecently = timeSinceLastRerouteMillis < writeLoadConstraintSettings.getMinimumRerouteInterval()
             .millis();
 
+        // We know that there is at least one hot-spotting node if we've reached this code. Now check whether there are any new hot-spots
+        // or hot-spots that are persisting and need further balancing work.
         if (haveCalledRerouteRecently == false
             || Sets.difference(nodeIdsExceedingLatencyThreshold, lastSetOfHotSpottedNodes).isEmpty() == false) {
             if (logger.isDebugEnabled()) {
                 logger.debug(
-                    "Found {} exceeding the write thread pool queue latency threshold ({} total), triggering reroute",
+                    """
+                        Nodes [{}] are hot-spotting, of {} total cluster nodes. Reroute for hot-spotting {}. \
+                        Previously hot-spotting nodes are [{}]. The write thread pool queue latency threshold is [{}]. Triggering reroute.
+                        """,
                     nodeSummary(nodeIdsExceedingLatencyThreshold),
+                    state.nodes().size(),
+                    lastRerouteTimeMillis == 0
+                        ? "has never previously been called"
+                        : "was last called [" + TimeValue.timeValueMillis(timeSinceLastRerouteMillis) + "] ago",
+                    nodeSummary(lastSetOfHotSpottedNodes),
                     state.nodes().size()
                 );
             }
@@ -115,7 +126,10 @@ public void onNewInfo(ClusterInfo clusterInfo) {
             lastRerouteTimeMillis = currentTimeMillisSupplier.getAsLong();
             lastSetOfHotSpottedNodes = nodeIdsExceedingLatencyThreshold;
         } else {
-            logger.debug("Not calling reroute because we called reroute recently and there are no new hot spots");
+            logger.debug(
+                "Not calling reroute because we called reroute [{}] ago and there are no new hot spots",
+                TimeValue.timeValueMillis(timeSinceLastRerouteMillis)
+            );
         }
     }
 
diff --git a/server/src/test/java/org/elasticsearch/cluster/routing/allocation/WriteLoadConstraintMonitorTests.java b/server/src/test/java/org/elasticsearch/cluster/routing/allocation/WriteLoadConstraintMonitorTests.java
@@ -62,7 +62,7 @@ public void testRerouteIsCalledWhenAHotSpotIsDetected() {
     }
 
     @TestLogging(
-        value = "org.elasticsearch.cluster.routing.allocation.WriteLoadConstraintMonitor:DEBUG",
+        value = "org.elasticsearch.cluster.routing.allocation.WriteLoadConstraintMonitor:TRACE",
         reason = "ensure we're skipping reroute for the right reason"
     )
     public void testRerouteIsNotCalledWhenStateIsNotRecovered() {
@@ -81,7 +81,7 @@ public void testRerouteIsNotCalledWhenStateIsNotRecovered() {
                 new MockLog.SeenEventExpectation(
                     "don't reroute due to global block",
                     WriteLoadConstraintMonitor.class.getCanonicalName(),
-                    Level.DEBUG,
+                    Level.TRACE,
                     "skipping monitor as the cluster state is not recovered yet"
                 )
             );
@@ -93,7 +93,7 @@ public void testRerouteIsNotCalledWhenStateIsNotRecovered() {
     }
 
     @TestLogging(
-        value = "org.elasticsearch.cluster.routing.allocation.WriteLoadConstraintMonitor:DEBUG",
+        value = "org.elasticsearch.cluster.routing.allocation.WriteLoadConstraintMonitor:TRACE",
         reason = "ensure we're skipping reroute for the right reason"
     )
     public void testRerouteIsNotCalledWhenDeciderIsNotEnabled() {
@@ -117,7 +117,7 @@ public void testRerouteIsNotCalledWhenDeciderIsNotEnabled() {
                 new MockLog.SeenEventExpectation(
                     "don't reroute due to decider being disabled",
                     WriteLoadConstraintMonitor.class.getCanonicalName(),
-                    Level.DEBUG,
+                    Level.TRACE,
                     "skipping monitor because the write load decider is not fully enabled"
                 )
             );
@@ -129,7 +129,7 @@ public void testRerouteIsNotCalledWhenDeciderIsNotEnabled() {
     }
 
     @TestLogging(
-        value = "org.elasticsearch.cluster.routing.allocation.WriteLoadConstraintMonitor:DEBUG",
+        value = "org.elasticsearch.cluster.routing.allocation.WriteLoadConstraintMonitor:TRACE",
         reason = "ensure we're skipping reroute for the right reason"
     )
     public void testRerouteIsNotCalledWhenNoNodesAreHotSpotting() {
@@ -146,7 +146,7 @@ public void testRerouteIsNotCalledWhenNoNodesAreHotSpotting() {
                 new MockLog.SeenEventExpectation(
                     "don't reroute due to no nodes hot-spotting",
                     WriteLoadConstraintMonitor.class.getCanonicalName(),
-                    Level.DEBUG,
+                    Level.TRACE,
                     "No hot-spotting nodes detected"
                 )
             );
@@ -196,7 +196,7 @@ public void testRerouteIsNotCalledAgainBeforeMinimumIntervalHasPassed() {
                         "don't reroute due to reroute being called recently",
                         WriteLoadConstraintMonitor.class.getCanonicalName(),
                         Level.DEBUG,
-                        "Not calling reroute because we called reroute recently and there are no new hot spots"
+                        "Not calling reroute because we called reroute * ago and there are no new hot spots"
                     )
                 );
                 writeLoadConstraintMonitor.onNewInfo(testState.clusterInfo);
@@ -213,7 +213,7 @@ public void testRerouteIsNotCalledAgainBeforeMinimumIntervalHasPassed() {
     }
 
     @TestLogging(
-        value = "org.elasticsearch.cluster.routing.allocation.WriteLoadConstraintMonitor:DEBUG",
+        value = "org.elasticsearch.cluster.routing.allocation.WriteLoadConstraintMonitor:TRACE",
         reason = "ensure we're skipping reroute for the right reason"
     )
     public void testRerouteIsCalledBeforeMinimumIntervalHasPassedIfNewNodesBecomeHotSpotted() {

Original file line number	Diff line number	Diff line change
`@@ -62,7 +62,7 @@ public void testRerouteIsCalledWhenAHotSpotIsDetected() {`
`62`	`62`	`}`
`63`	`63`
`64`	`64`	`@TestLogging(`
`65`		`- value = "org.elasticsearch.cluster.routing.allocation.WriteLoadConstraintMonitor:DEBUG",`
	`65`	`+ value = "org.elasticsearch.cluster.routing.allocation.WriteLoadConstraintMonitor:TRACE",`
`66`	`66`	`reason = "ensure we're skipping reroute for the right reason"`
`67`	`67`	`)`
`68`	`68`	`public void testRerouteIsNotCalledWhenStateIsNotRecovered() {`
`@@ -81,7 +81,7 @@ public void testRerouteIsNotCalledWhenStateIsNotRecovered() {`
`81`	`81`	`new MockLog.SeenEventExpectation(`
`82`	`82`	`"don't reroute due to global block",`
`83`	`83`	`WriteLoadConstraintMonitor.class.getCanonicalName(),`
`84`		`- Level.DEBUG,`
	`84`	`+ Level.TRACE,`
`85`	`85`	`"skipping monitor as the cluster state is not recovered yet"`
`86`	`86`	`)`
`87`	`87`	`);`
`@@ -93,7 +93,7 @@ public void testRerouteIsNotCalledWhenStateIsNotRecovered() {`
`93`	`93`	`}`
`94`	`94`
`95`	`95`	`@TestLogging(`
`96`		`- value = "org.elasticsearch.cluster.routing.allocation.WriteLoadConstraintMonitor:DEBUG",`
	`96`	`+ value = "org.elasticsearch.cluster.routing.allocation.WriteLoadConstraintMonitor:TRACE",`
`97`	`97`	`reason = "ensure we're skipping reroute for the right reason"`
`98`	`98`	`)`
`99`	`99`	`public void testRerouteIsNotCalledWhenDeciderIsNotEnabled() {`
`@@ -117,7 +117,7 @@ public void testRerouteIsNotCalledWhenDeciderIsNotEnabled() {`
`117`	`117`	`new MockLog.SeenEventExpectation(`
`118`	`118`	`"don't reroute due to decider being disabled",`
`119`	`119`	`WriteLoadConstraintMonitor.class.getCanonicalName(),`
`120`		`- Level.DEBUG,`
	`120`	`+ Level.TRACE,`
`121`	`121`	`"skipping monitor because the write load decider is not fully enabled"`
`122`	`122`	`)`
`123`	`123`	`);`
`@@ -129,7 +129,7 @@ public void testRerouteIsNotCalledWhenDeciderIsNotEnabled() {`
`129`	`129`	`}`
`130`	`130`
`131`	`131`	`@TestLogging(`
`132`		`- value = "org.elasticsearch.cluster.routing.allocation.WriteLoadConstraintMonitor:DEBUG",`
	`132`	`+ value = "org.elasticsearch.cluster.routing.allocation.WriteLoadConstraintMonitor:TRACE",`
`133`	`133`	`reason = "ensure we're skipping reroute for the right reason"`
`134`	`134`	`)`
`135`	`135`	`public void testRerouteIsNotCalledWhenNoNodesAreHotSpotting() {`
`@@ -146,7 +146,7 @@ public void testRerouteIsNotCalledWhenNoNodesAreHotSpotting() {`
`146`	`146`	`new MockLog.SeenEventExpectation(`
`147`	`147`	`"don't reroute due to no nodes hot-spotting",`
`148`	`148`	`WriteLoadConstraintMonitor.class.getCanonicalName(),`
`149`		`- Level.DEBUG,`
	`149`	`+ Level.TRACE,`
`150`	`150`	`"No hot-spotting nodes detected"`
`151`	`151`	`)`
`152`	`152`	`);`
`@@ -196,7 +196,7 @@ public void testRerouteIsNotCalledAgainBeforeMinimumIntervalHasPassed() {`
`196`	`196`	`"don't reroute due to reroute being called recently",`
`197`	`197`	`WriteLoadConstraintMonitor.class.getCanonicalName(),`
`198`	`198`	`Level.DEBUG,`
`199`		`- "Not calling reroute because we called reroute recently and there are no new hot spots"`
	`199`	`+ "Not calling reroute because we called reroute * ago and there are no new hot spots"`
`200`	`200`	`)`
`201`	`201`	`);`
`202`	`202`	`writeLoadConstraintMonitor.onNewInfo(testState.clusterInfo);`
`@@ -213,7 +213,7 @@ public void testRerouteIsNotCalledAgainBeforeMinimumIntervalHasPassed() {`
`213`	`213`	`}`
`214`	`214`
`215`	`215`	`@TestLogging(`
`216`		`- value = "org.elasticsearch.cluster.routing.allocation.WriteLoadConstraintMonitor:DEBUG",`
	`216`	`+ value = "org.elasticsearch.cluster.routing.allocation.WriteLoadConstraintMonitor:TRACE",`
`217`	`217`	`reason = "ensure we're skipping reroute for the right reason"`
`218`	`218`	`)`
`219`	`219`	`public void testRerouteIsCalledBeforeMinimumIntervalHasPassedIfNewNodesBecomeHotSpotted() {`