diff --git a/server/src/main/java/org/elasticsearch/cluster/routing/allocation/WriteLoadConstraintMonitor.java b/server/src/main/java/org/elasticsearch/cluster/routing/allocation/WriteLoadConstraintMonitor.java index d497270950224..c0d8bd4e06e48 100644 --- a/server/src/main/java/org/elasticsearch/cluster/routing/allocation/WriteLoadConstraintMonitor.java +++ b/server/src/main/java/org/elasticsearch/cluster/routing/allocation/WriteLoadConstraintMonitor.java @@ -21,6 +21,7 @@ import org.elasticsearch.common.Strings; import org.elasticsearch.common.settings.ClusterSettings; import org.elasticsearch.common.util.set.Sets; +import org.elasticsearch.core.TimeValue; import org.elasticsearch.gateway.GatewayService; import org.elasticsearch.threadpool.ThreadPool; @@ -62,12 +63,12 @@ public WriteLoadConstraintMonitor( public void onNewInfo(ClusterInfo clusterInfo) { final ClusterState state = clusterStateSupplier.get(); if (state.blocks().hasGlobalBlock(GatewayService.STATE_NOT_RECOVERED_BLOCK)) { - logger.debug("skipping monitor as the cluster state is not recovered yet"); + logger.trace("skipping monitor as the cluster state is not recovered yet"); return; } if (writeLoadConstraintSettings.getWriteLoadConstraintEnabled().notFullyEnabled()) { - logger.debug("skipping monitor because the write load decider is not fully enabled"); + logger.trace("skipping monitor because the write load decider is not fully enabled"); return; } @@ -85,7 +86,7 @@ public void onNewInfo(ClusterInfo clusterInfo) { }); if (nodeIdsExceedingLatencyThreshold.isEmpty()) { - logger.debug("No hot-spotting nodes detected"); + logger.trace("No hot-spotting nodes detected"); return; } @@ -94,12 +95,22 @@ public void onNewInfo(ClusterInfo clusterInfo) { final boolean haveCalledRerouteRecently = timeSinceLastRerouteMillis < writeLoadConstraintSettings.getMinimumRerouteInterval() .millis(); + // We know that there is at least one hot-spotting node if we've reached this code. Now check whether there are any new hot-spots + // or hot-spots that are persisting and need further balancing work. if (haveCalledRerouteRecently == false || Sets.difference(nodeIdsExceedingLatencyThreshold, lastSetOfHotSpottedNodes).isEmpty() == false) { if (logger.isDebugEnabled()) { logger.debug( - "Found {} exceeding the write thread pool queue latency threshold ({} total), triggering reroute", + """ + Nodes [{}] are hot-spotting, of {} total cluster nodes. Reroute for hot-spotting {}. \ + Previously hot-spotting nodes are [{}]. The write thread pool queue latency threshold is [{}]. Triggering reroute. + """, nodeSummary(nodeIdsExceedingLatencyThreshold), + state.nodes().size(), + lastRerouteTimeMillis == 0 + ? "has never previously been called" + : "was last called [" + TimeValue.timeValueMillis(timeSinceLastRerouteMillis) + "] ago", + nodeSummary(lastSetOfHotSpottedNodes), state.nodes().size() ); } @@ -115,7 +126,10 @@ public void onNewInfo(ClusterInfo clusterInfo) { lastRerouteTimeMillis = currentTimeMillisSupplier.getAsLong(); lastSetOfHotSpottedNodes = nodeIdsExceedingLatencyThreshold; } else { - logger.debug("Not calling reroute because we called reroute recently and there are no new hot spots"); + logger.debug( + "Not calling reroute because we called reroute [{}] ago and there are no new hot spots", + TimeValue.timeValueMillis(timeSinceLastRerouteMillis) + ); } } diff --git a/server/src/test/java/org/elasticsearch/cluster/routing/allocation/WriteLoadConstraintMonitorTests.java b/server/src/test/java/org/elasticsearch/cluster/routing/allocation/WriteLoadConstraintMonitorTests.java index fea79d92c9fbf..210ca90750e5c 100644 --- a/server/src/test/java/org/elasticsearch/cluster/routing/allocation/WriteLoadConstraintMonitorTests.java +++ b/server/src/test/java/org/elasticsearch/cluster/routing/allocation/WriteLoadConstraintMonitorTests.java @@ -62,7 +62,7 @@ public void testRerouteIsCalledWhenAHotSpotIsDetected() { } @TestLogging( - value = "org.elasticsearch.cluster.routing.allocation.WriteLoadConstraintMonitor:DEBUG", + value = "org.elasticsearch.cluster.routing.allocation.WriteLoadConstraintMonitor:TRACE", reason = "ensure we're skipping reroute for the right reason" ) public void testRerouteIsNotCalledWhenStateIsNotRecovered() { @@ -81,7 +81,7 @@ public void testRerouteIsNotCalledWhenStateIsNotRecovered() { new MockLog.SeenEventExpectation( "don't reroute due to global block", WriteLoadConstraintMonitor.class.getCanonicalName(), - Level.DEBUG, + Level.TRACE, "skipping monitor as the cluster state is not recovered yet" ) ); @@ -93,7 +93,7 @@ public void testRerouteIsNotCalledWhenStateIsNotRecovered() { } @TestLogging( - value = "org.elasticsearch.cluster.routing.allocation.WriteLoadConstraintMonitor:DEBUG", + value = "org.elasticsearch.cluster.routing.allocation.WriteLoadConstraintMonitor:TRACE", reason = "ensure we're skipping reroute for the right reason" ) public void testRerouteIsNotCalledWhenDeciderIsNotEnabled() { @@ -117,7 +117,7 @@ public void testRerouteIsNotCalledWhenDeciderIsNotEnabled() { new MockLog.SeenEventExpectation( "don't reroute due to decider being disabled", WriteLoadConstraintMonitor.class.getCanonicalName(), - Level.DEBUG, + Level.TRACE, "skipping monitor because the write load decider is not fully enabled" ) ); @@ -129,7 +129,7 @@ public void testRerouteIsNotCalledWhenDeciderIsNotEnabled() { } @TestLogging( - value = "org.elasticsearch.cluster.routing.allocation.WriteLoadConstraintMonitor:DEBUG", + value = "org.elasticsearch.cluster.routing.allocation.WriteLoadConstraintMonitor:TRACE", reason = "ensure we're skipping reroute for the right reason" ) public void testRerouteIsNotCalledWhenNoNodesAreHotSpotting() { @@ -146,7 +146,7 @@ public void testRerouteIsNotCalledWhenNoNodesAreHotSpotting() { new MockLog.SeenEventExpectation( "don't reroute due to no nodes hot-spotting", WriteLoadConstraintMonitor.class.getCanonicalName(), - Level.DEBUG, + Level.TRACE, "No hot-spotting nodes detected" ) ); @@ -196,7 +196,7 @@ public void testRerouteIsNotCalledAgainBeforeMinimumIntervalHasPassed() { "don't reroute due to reroute being called recently", WriteLoadConstraintMonitor.class.getCanonicalName(), Level.DEBUG, - "Not calling reroute because we called reroute recently and there are no new hot spots" + "Not calling reroute because we called reroute * ago and there are no new hot spots" ) ); writeLoadConstraintMonitor.onNewInfo(testState.clusterInfo); @@ -213,7 +213,7 @@ public void testRerouteIsNotCalledAgainBeforeMinimumIntervalHasPassed() { } @TestLogging( - value = "org.elasticsearch.cluster.routing.allocation.WriteLoadConstraintMonitor:DEBUG", + value = "org.elasticsearch.cluster.routing.allocation.WriteLoadConstraintMonitor:TRACE", reason = "ensure we're skipping reroute for the right reason" ) public void testRerouteIsCalledBeforeMinimumIntervalHasPassedIfNewNodesBecomeHotSpotted() {