From 5db528695e24a1e26aa550d648f8c800c740b47e Mon Sep 17 00:00:00 2001 From: Dianna Hohensee Date: Tue, 7 Oct 2025 17:05:50 -0400 Subject: [PATCH 1/2] Update write load monitor log-levels Change the log-levels in the monitor to use debug for activity and trace for no activity. This supports enabling DEBUG logging in production to show when the write load logic activates. Closes ES-13139 --- .../WriteLoadConstraintMonitor.java | 22 ++++++++++++++----- .../WriteLoadConstraintMonitorTests.java | 16 +++++++------- 2 files changed, 25 insertions(+), 13 deletions(-) diff --git a/server/src/main/java/org/elasticsearch/cluster/routing/allocation/WriteLoadConstraintMonitor.java b/server/src/main/java/org/elasticsearch/cluster/routing/allocation/WriteLoadConstraintMonitor.java index d497270950224..6bd1c5d79eb0d 100644 --- a/server/src/main/java/org/elasticsearch/cluster/routing/allocation/WriteLoadConstraintMonitor.java +++ b/server/src/main/java/org/elasticsearch/cluster/routing/allocation/WriteLoadConstraintMonitor.java @@ -21,6 +21,7 @@ import org.elasticsearch.common.Strings; import org.elasticsearch.common.settings.ClusterSettings; import org.elasticsearch.common.util.set.Sets; +import org.elasticsearch.core.TimeValue; import org.elasticsearch.gateway.GatewayService; import org.elasticsearch.threadpool.ThreadPool; @@ -62,12 +63,12 @@ public WriteLoadConstraintMonitor( public void onNewInfo(ClusterInfo clusterInfo) { final ClusterState state = clusterStateSupplier.get(); if (state.blocks().hasGlobalBlock(GatewayService.STATE_NOT_RECOVERED_BLOCK)) { - logger.debug("skipping monitor as the cluster state is not recovered yet"); + logger.trace("skipping monitor as the cluster state is not recovered yet"); return; } if (writeLoadConstraintSettings.getWriteLoadConstraintEnabled().notFullyEnabled()) { - logger.debug("skipping monitor because the write load decider is not fully enabled"); + logger.trace("skipping monitor because the write load decider is not fully enabled"); return; } @@ -85,7 +86,7 @@ public void onNewInfo(ClusterInfo clusterInfo) { }); if (nodeIdsExceedingLatencyThreshold.isEmpty()) { - logger.debug("No hot-spotting nodes detected"); + logger.trace("No hot-spotting nodes detected"); return; } @@ -94,12 +95,20 @@ public void onNewInfo(ClusterInfo clusterInfo) { final boolean haveCalledRerouteRecently = timeSinceLastRerouteMillis < writeLoadConstraintSettings.getMinimumRerouteInterval() .millis(); + // We know that there is at least one hot-spotting node if we've reached this code. Now check whether there are any new hot-spots + // or hot-spots that are persisting and need further balancing work. if (haveCalledRerouteRecently == false || Sets.difference(nodeIdsExceedingLatencyThreshold, lastSetOfHotSpottedNodes).isEmpty() == false) { if (logger.isDebugEnabled()) { logger.debug( - "Found {} exceeding the write thread pool queue latency threshold ({} total), triggering reroute", + """ + Nodes [{}] are hot-spotting, of {} total cluster nodes. Reroute for hot-spotting was last called [{}] ago. \ + Previously hot-spotting nodes are [{}]. The write thread pool queue latency threshold is [{}]. Triggering reroute. + """, nodeSummary(nodeIdsExceedingLatencyThreshold), + state.nodes().size(), + lastRerouteTimeMillis == 0 ? "never" : TimeValue.timeValueMillis(timeSinceLastRerouteMillis), + nodeSummary(lastSetOfHotSpottedNodes), state.nodes().size() ); } @@ -115,7 +124,10 @@ public void onNewInfo(ClusterInfo clusterInfo) { lastRerouteTimeMillis = currentTimeMillisSupplier.getAsLong(); lastSetOfHotSpottedNodes = nodeIdsExceedingLatencyThreshold; } else { - logger.debug("Not calling reroute because we called reroute recently and there are no new hot spots"); + logger.debug( + "Not calling reroute because we called reroute [{}] ago and there are no new hot spots", + TimeValue.timeValueMillis(timeSinceLastRerouteMillis) + ); } } diff --git a/server/src/test/java/org/elasticsearch/cluster/routing/allocation/WriteLoadConstraintMonitorTests.java b/server/src/test/java/org/elasticsearch/cluster/routing/allocation/WriteLoadConstraintMonitorTests.java index fea79d92c9fbf..210ca90750e5c 100644 --- a/server/src/test/java/org/elasticsearch/cluster/routing/allocation/WriteLoadConstraintMonitorTests.java +++ b/server/src/test/java/org/elasticsearch/cluster/routing/allocation/WriteLoadConstraintMonitorTests.java @@ -62,7 +62,7 @@ public void testRerouteIsCalledWhenAHotSpotIsDetected() { } @TestLogging( - value = "org.elasticsearch.cluster.routing.allocation.WriteLoadConstraintMonitor:DEBUG", + value = "org.elasticsearch.cluster.routing.allocation.WriteLoadConstraintMonitor:TRACE", reason = "ensure we're skipping reroute for the right reason" ) public void testRerouteIsNotCalledWhenStateIsNotRecovered() { @@ -81,7 +81,7 @@ public void testRerouteIsNotCalledWhenStateIsNotRecovered() { new MockLog.SeenEventExpectation( "don't reroute due to global block", WriteLoadConstraintMonitor.class.getCanonicalName(), - Level.DEBUG, + Level.TRACE, "skipping monitor as the cluster state is not recovered yet" ) ); @@ -93,7 +93,7 @@ public void testRerouteIsNotCalledWhenStateIsNotRecovered() { } @TestLogging( - value = "org.elasticsearch.cluster.routing.allocation.WriteLoadConstraintMonitor:DEBUG", + value = "org.elasticsearch.cluster.routing.allocation.WriteLoadConstraintMonitor:TRACE", reason = "ensure we're skipping reroute for the right reason" ) public void testRerouteIsNotCalledWhenDeciderIsNotEnabled() { @@ -117,7 +117,7 @@ public void testRerouteIsNotCalledWhenDeciderIsNotEnabled() { new MockLog.SeenEventExpectation( "don't reroute due to decider being disabled", WriteLoadConstraintMonitor.class.getCanonicalName(), - Level.DEBUG, + Level.TRACE, "skipping monitor because the write load decider is not fully enabled" ) ); @@ -129,7 +129,7 @@ public void testRerouteIsNotCalledWhenDeciderIsNotEnabled() { } @TestLogging( - value = "org.elasticsearch.cluster.routing.allocation.WriteLoadConstraintMonitor:DEBUG", + value = "org.elasticsearch.cluster.routing.allocation.WriteLoadConstraintMonitor:TRACE", reason = "ensure we're skipping reroute for the right reason" ) public void testRerouteIsNotCalledWhenNoNodesAreHotSpotting() { @@ -146,7 +146,7 @@ public void testRerouteIsNotCalledWhenNoNodesAreHotSpotting() { new MockLog.SeenEventExpectation( "don't reroute due to no nodes hot-spotting", WriteLoadConstraintMonitor.class.getCanonicalName(), - Level.DEBUG, + Level.TRACE, "No hot-spotting nodes detected" ) ); @@ -196,7 +196,7 @@ public void testRerouteIsNotCalledAgainBeforeMinimumIntervalHasPassed() { "don't reroute due to reroute being called recently", WriteLoadConstraintMonitor.class.getCanonicalName(), Level.DEBUG, - "Not calling reroute because we called reroute recently and there are no new hot spots" + "Not calling reroute because we called reroute * ago and there are no new hot spots" ) ); writeLoadConstraintMonitor.onNewInfo(testState.clusterInfo); @@ -213,7 +213,7 @@ public void testRerouteIsNotCalledAgainBeforeMinimumIntervalHasPassed() { } @TestLogging( - value = "org.elasticsearch.cluster.routing.allocation.WriteLoadConstraintMonitor:DEBUG", + value = "org.elasticsearch.cluster.routing.allocation.WriteLoadConstraintMonitor:TRACE", reason = "ensure we're skipping reroute for the right reason" ) public void testRerouteIsCalledBeforeMinimumIntervalHasPassedIfNewNodesBecomeHotSpotted() { From 14661de6fd89db9f48d275d86208278a41712967 Mon Sep 17 00:00:00 2001 From: Dianna Hohensee Date: Wed, 8 Oct 2025 11:32:16 -0400 Subject: [PATCH 2/2] improve log message --- .../routing/allocation/WriteLoadConstraintMonitor.java | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/server/src/main/java/org/elasticsearch/cluster/routing/allocation/WriteLoadConstraintMonitor.java b/server/src/main/java/org/elasticsearch/cluster/routing/allocation/WriteLoadConstraintMonitor.java index 6bd1c5d79eb0d..c0d8bd4e06e48 100644 --- a/server/src/main/java/org/elasticsearch/cluster/routing/allocation/WriteLoadConstraintMonitor.java +++ b/server/src/main/java/org/elasticsearch/cluster/routing/allocation/WriteLoadConstraintMonitor.java @@ -102,12 +102,14 @@ public void onNewInfo(ClusterInfo clusterInfo) { if (logger.isDebugEnabled()) { logger.debug( """ - Nodes [{}] are hot-spotting, of {} total cluster nodes. Reroute for hot-spotting was last called [{}] ago. \ + Nodes [{}] are hot-spotting, of {} total cluster nodes. Reroute for hot-spotting {}. \ Previously hot-spotting nodes are [{}]. The write thread pool queue latency threshold is [{}]. Triggering reroute. """, nodeSummary(nodeIdsExceedingLatencyThreshold), state.nodes().size(), - lastRerouteTimeMillis == 0 ? "never" : TimeValue.timeValueMillis(timeSinceLastRerouteMillis), + lastRerouteTimeMillis == 0 + ? "has never previously been called" + : "was last called [" + TimeValue.timeValueMillis(timeSinceLastRerouteMillis) + "] ago", nodeSummary(lastSetOfHotSpottedNodes), state.nodes().size() );