Skip to content

Commit b7237d3

Browse files
Update write load monitor log-levels (#136137)
Change the log-levels in the monitor to use debug for activity and trace for no activity. This supports enabling DEBUG logging in production to show when the write load logic activates. Closes ES-13139
1 parent f32d7b6 commit b7237d3

File tree

2 files changed

+27
-13
lines changed

2 files changed

+27
-13
lines changed

server/src/main/java/org/elasticsearch/cluster/routing/allocation/WriteLoadConstraintMonitor.java

Lines changed: 19 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,7 @@
2121
import org.elasticsearch.common.Strings;
2222
import org.elasticsearch.common.settings.ClusterSettings;
2323
import org.elasticsearch.common.util.set.Sets;
24+
import org.elasticsearch.core.TimeValue;
2425
import org.elasticsearch.gateway.GatewayService;
2526
import org.elasticsearch.threadpool.ThreadPool;
2627

@@ -62,12 +63,12 @@ public WriteLoadConstraintMonitor(
6263
public void onNewInfo(ClusterInfo clusterInfo) {
6364
final ClusterState state = clusterStateSupplier.get();
6465
if (state.blocks().hasGlobalBlock(GatewayService.STATE_NOT_RECOVERED_BLOCK)) {
65-
logger.debug("skipping monitor as the cluster state is not recovered yet");
66+
logger.trace("skipping monitor as the cluster state is not recovered yet");
6667
return;
6768
}
6869

6970
if (writeLoadConstraintSettings.getWriteLoadConstraintEnabled().notFullyEnabled()) {
70-
logger.debug("skipping monitor because the write load decider is not fully enabled");
71+
logger.trace("skipping monitor because the write load decider is not fully enabled");
7172
return;
7273
}
7374

@@ -85,7 +86,7 @@ public void onNewInfo(ClusterInfo clusterInfo) {
8586
});
8687

8788
if (nodeIdsExceedingLatencyThreshold.isEmpty()) {
88-
logger.debug("No hot-spotting nodes detected");
89+
logger.trace("No hot-spotting nodes detected");
8990
return;
9091
}
9192

@@ -94,12 +95,22 @@ public void onNewInfo(ClusterInfo clusterInfo) {
9495
final boolean haveCalledRerouteRecently = timeSinceLastRerouteMillis < writeLoadConstraintSettings.getMinimumRerouteInterval()
9596
.millis();
9697

98+
// We know that there is at least one hot-spotting node if we've reached this code. Now check whether there are any new hot-spots
99+
// or hot-spots that are persisting and need further balancing work.
97100
if (haveCalledRerouteRecently == false
98101
|| Sets.difference(nodeIdsExceedingLatencyThreshold, lastSetOfHotSpottedNodes).isEmpty() == false) {
99102
if (logger.isDebugEnabled()) {
100103
logger.debug(
101-
"Found {} exceeding the write thread pool queue latency threshold ({} total), triggering reroute",
104+
"""
105+
Nodes [{}] are hot-spotting, of {} total cluster nodes. Reroute for hot-spotting {}. \
106+
Previously hot-spotting nodes are [{}]. The write thread pool queue latency threshold is [{}]. Triggering reroute.
107+
""",
102108
nodeSummary(nodeIdsExceedingLatencyThreshold),
109+
state.nodes().size(),
110+
lastRerouteTimeMillis == 0
111+
? "has never previously been called"
112+
: "was last called [" + TimeValue.timeValueMillis(timeSinceLastRerouteMillis) + "] ago",
113+
nodeSummary(lastSetOfHotSpottedNodes),
103114
state.nodes().size()
104115
);
105116
}
@@ -115,7 +126,10 @@ public void onNewInfo(ClusterInfo clusterInfo) {
115126
lastRerouteTimeMillis = currentTimeMillisSupplier.getAsLong();
116127
lastSetOfHotSpottedNodes = nodeIdsExceedingLatencyThreshold;
117128
} else {
118-
logger.debug("Not calling reroute because we called reroute recently and there are no new hot spots");
129+
logger.debug(
130+
"Not calling reroute because we called reroute [{}] ago and there are no new hot spots",
131+
TimeValue.timeValueMillis(timeSinceLastRerouteMillis)
132+
);
119133
}
120134
}
121135

server/src/test/java/org/elasticsearch/cluster/routing/allocation/WriteLoadConstraintMonitorTests.java

Lines changed: 8 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -62,7 +62,7 @@ public void testRerouteIsCalledWhenAHotSpotIsDetected() {
6262
}
6363

6464
@TestLogging(
65-
value = "org.elasticsearch.cluster.routing.allocation.WriteLoadConstraintMonitor:DEBUG",
65+
value = "org.elasticsearch.cluster.routing.allocation.WriteLoadConstraintMonitor:TRACE",
6666
reason = "ensure we're skipping reroute for the right reason"
6767
)
6868
public void testRerouteIsNotCalledWhenStateIsNotRecovered() {
@@ -81,7 +81,7 @@ public void testRerouteIsNotCalledWhenStateIsNotRecovered() {
8181
new MockLog.SeenEventExpectation(
8282
"don't reroute due to global block",
8383
WriteLoadConstraintMonitor.class.getCanonicalName(),
84-
Level.DEBUG,
84+
Level.TRACE,
8585
"skipping monitor as the cluster state is not recovered yet"
8686
)
8787
);
@@ -93,7 +93,7 @@ public void testRerouteIsNotCalledWhenStateIsNotRecovered() {
9393
}
9494

9595
@TestLogging(
96-
value = "org.elasticsearch.cluster.routing.allocation.WriteLoadConstraintMonitor:DEBUG",
96+
value = "org.elasticsearch.cluster.routing.allocation.WriteLoadConstraintMonitor:TRACE",
9797
reason = "ensure we're skipping reroute for the right reason"
9898
)
9999
public void testRerouteIsNotCalledWhenDeciderIsNotEnabled() {
@@ -117,7 +117,7 @@ public void testRerouteIsNotCalledWhenDeciderIsNotEnabled() {
117117
new MockLog.SeenEventExpectation(
118118
"don't reroute due to decider being disabled",
119119
WriteLoadConstraintMonitor.class.getCanonicalName(),
120-
Level.DEBUG,
120+
Level.TRACE,
121121
"skipping monitor because the write load decider is not fully enabled"
122122
)
123123
);
@@ -129,7 +129,7 @@ public void testRerouteIsNotCalledWhenDeciderIsNotEnabled() {
129129
}
130130

131131
@TestLogging(
132-
value = "org.elasticsearch.cluster.routing.allocation.WriteLoadConstraintMonitor:DEBUG",
132+
value = "org.elasticsearch.cluster.routing.allocation.WriteLoadConstraintMonitor:TRACE",
133133
reason = "ensure we're skipping reroute for the right reason"
134134
)
135135
public void testRerouteIsNotCalledWhenNoNodesAreHotSpotting() {
@@ -146,7 +146,7 @@ public void testRerouteIsNotCalledWhenNoNodesAreHotSpotting() {
146146
new MockLog.SeenEventExpectation(
147147
"don't reroute due to no nodes hot-spotting",
148148
WriteLoadConstraintMonitor.class.getCanonicalName(),
149-
Level.DEBUG,
149+
Level.TRACE,
150150
"No hot-spotting nodes detected"
151151
)
152152
);
@@ -196,7 +196,7 @@ public void testRerouteIsNotCalledAgainBeforeMinimumIntervalHasPassed() {
196196
"don't reroute due to reroute being called recently",
197197
WriteLoadConstraintMonitor.class.getCanonicalName(),
198198
Level.DEBUG,
199-
"Not calling reroute because we called reroute recently and there are no new hot spots"
199+
"Not calling reroute because we called reroute * ago and there are no new hot spots"
200200
)
201201
);
202202
writeLoadConstraintMonitor.onNewInfo(testState.clusterInfo);
@@ -213,7 +213,7 @@ public void testRerouteIsNotCalledAgainBeforeMinimumIntervalHasPassed() {
213213
}
214214

215215
@TestLogging(
216-
value = "org.elasticsearch.cluster.routing.allocation.WriteLoadConstraintMonitor:DEBUG",
216+
value = "org.elasticsearch.cluster.routing.allocation.WriteLoadConstraintMonitor:TRACE",
217217
reason = "ensure we're skipping reroute for the right reason"
218218
)
219219
public void testRerouteIsCalledBeforeMinimumIntervalHasPassedIfNewNodesBecomeHotSpotted() {

0 commit comments

Comments
 (0)