Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@
import org.elasticsearch.common.Strings;
import org.elasticsearch.common.settings.ClusterSettings;
import org.elasticsearch.common.util.set.Sets;
import org.elasticsearch.core.TimeValue;
import org.elasticsearch.gateway.GatewayService;
import org.elasticsearch.threadpool.ThreadPool;

Expand Down Expand Up @@ -62,12 +63,12 @@ public WriteLoadConstraintMonitor(
public void onNewInfo(ClusterInfo clusterInfo) {
final ClusterState state = clusterStateSupplier.get();
if (state.blocks().hasGlobalBlock(GatewayService.STATE_NOT_RECOVERED_BLOCK)) {
logger.debug("skipping monitor as the cluster state is not recovered yet");
logger.trace("skipping monitor as the cluster state is not recovered yet");
return;
}

if (writeLoadConstraintSettings.getWriteLoadConstraintEnabled().notFullyEnabled()) {
logger.debug("skipping monitor because the write load decider is not fully enabled");
logger.trace("skipping monitor because the write load decider is not fully enabled");
return;
}

Expand All @@ -85,7 +86,7 @@ public void onNewInfo(ClusterInfo clusterInfo) {
});

if (nodeIdsExceedingLatencyThreshold.isEmpty()) {
logger.debug("No hot-spotting nodes detected");
logger.trace("No hot-spotting nodes detected");
return;
}

Expand All @@ -94,12 +95,22 @@ public void onNewInfo(ClusterInfo clusterInfo) {
final boolean haveCalledRerouteRecently = timeSinceLastRerouteMillis < writeLoadConstraintSettings.getMinimumRerouteInterval()
.millis();

// We know that there is at least one hot-spotting node if we've reached this code. Now check whether there are any new hot-spots
// or hot-spots that are persisting and need further balancing work.
if (haveCalledRerouteRecently == false
|| Sets.difference(nodeIdsExceedingLatencyThreshold, lastSetOfHotSpottedNodes).isEmpty() == false) {
if (logger.isDebugEnabled()) {
logger.debug(
"Found {} exceeding the write thread pool queue latency threshold ({} total), triggering reroute",
"""
Nodes [{}] are hot-spotting, of {} total cluster nodes. Reroute for hot-spotting {}. \
Previously hot-spotting nodes are [{}]. The write thread pool queue latency threshold is [{}]. Triggering reroute.
""",
nodeSummary(nodeIdsExceedingLatencyThreshold),
state.nodes().size(),
lastRerouteTimeMillis == 0
? "has never previously been called"
: "was last called [" + TimeValue.timeValueMillis(timeSinceLastRerouteMillis) + "] ago",
nodeSummary(lastSetOfHotSpottedNodes),
state.nodes().size()
);
}
Expand All @@ -115,7 +126,10 @@ public void onNewInfo(ClusterInfo clusterInfo) {
lastRerouteTimeMillis = currentTimeMillisSupplier.getAsLong();
lastSetOfHotSpottedNodes = nodeIdsExceedingLatencyThreshold;
} else {
logger.debug("Not calling reroute because we called reroute recently and there are no new hot spots");
logger.debug(
"Not calling reroute because we called reroute [{}] ago and there are no new hot spots",
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
"Not calling reroute because we called reroute [{}] ago and there are no new hot spots",
"Not calling reroute because we called reroute [{}] ago or there are no new hot spots",

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I believe this is and. If either enough time passed or a new hot spot, then the if-statement triggers. So the else-statement is when time has not passed AND there are no new hot-spots.

I tried to make that a bit clearer with the new comment before the if-else statement, but seems still tricky.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Err you are right.

TimeValue.timeValueMillis(timeSinceLastRerouteMillis)
);
}
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -62,7 +62,7 @@ public void testRerouteIsCalledWhenAHotSpotIsDetected() {
}

@TestLogging(
value = "org.elasticsearch.cluster.routing.allocation.WriteLoadConstraintMonitor:DEBUG",
value = "org.elasticsearch.cluster.routing.allocation.WriteLoadConstraintMonitor:TRACE",
reason = "ensure we're skipping reroute for the right reason"
)
public void testRerouteIsNotCalledWhenStateIsNotRecovered() {
Expand All @@ -81,7 +81,7 @@ public void testRerouteIsNotCalledWhenStateIsNotRecovered() {
new MockLog.SeenEventExpectation(
"don't reroute due to global block",
WriteLoadConstraintMonitor.class.getCanonicalName(),
Level.DEBUG,
Level.TRACE,
"skipping monitor as the cluster state is not recovered yet"
)
);
Expand All @@ -93,7 +93,7 @@ public void testRerouteIsNotCalledWhenStateIsNotRecovered() {
}

@TestLogging(
value = "org.elasticsearch.cluster.routing.allocation.WriteLoadConstraintMonitor:DEBUG",
value = "org.elasticsearch.cluster.routing.allocation.WriteLoadConstraintMonitor:TRACE",
reason = "ensure we're skipping reroute for the right reason"
)
public void testRerouteIsNotCalledWhenDeciderIsNotEnabled() {
Expand All @@ -117,7 +117,7 @@ public void testRerouteIsNotCalledWhenDeciderIsNotEnabled() {
new MockLog.SeenEventExpectation(
"don't reroute due to decider being disabled",
WriteLoadConstraintMonitor.class.getCanonicalName(),
Level.DEBUG,
Level.TRACE,
"skipping monitor because the write load decider is not fully enabled"
)
);
Expand All @@ -129,7 +129,7 @@ public void testRerouteIsNotCalledWhenDeciderIsNotEnabled() {
}

@TestLogging(
value = "org.elasticsearch.cluster.routing.allocation.WriteLoadConstraintMonitor:DEBUG",
value = "org.elasticsearch.cluster.routing.allocation.WriteLoadConstraintMonitor:TRACE",
reason = "ensure we're skipping reroute for the right reason"
)
public void testRerouteIsNotCalledWhenNoNodesAreHotSpotting() {
Expand All @@ -146,7 +146,7 @@ public void testRerouteIsNotCalledWhenNoNodesAreHotSpotting() {
new MockLog.SeenEventExpectation(
"don't reroute due to no nodes hot-spotting",
WriteLoadConstraintMonitor.class.getCanonicalName(),
Level.DEBUG,
Level.TRACE,
"No hot-spotting nodes detected"
)
);
Expand Down Expand Up @@ -196,7 +196,7 @@ public void testRerouteIsNotCalledAgainBeforeMinimumIntervalHasPassed() {
"don't reroute due to reroute being called recently",
WriteLoadConstraintMonitor.class.getCanonicalName(),
Level.DEBUG,
"Not calling reroute because we called reroute recently and there are no new hot spots"
"Not calling reroute because we called reroute * ago and there are no new hot spots"
)
);
writeLoadConstraintMonitor.onNewInfo(testState.clusterInfo);
Expand All @@ -213,7 +213,7 @@ public void testRerouteIsNotCalledAgainBeforeMinimumIntervalHasPassed() {
}

@TestLogging(
value = "org.elasticsearch.cluster.routing.allocation.WriteLoadConstraintMonitor:DEBUG",
value = "org.elasticsearch.cluster.routing.allocation.WriteLoadConstraintMonitor:TRACE",
reason = "ensure we're skipping reroute for the right reason"
)
public void testRerouteIsCalledBeforeMinimumIntervalHasPassedIfNewNodesBecomeHotSpotted() {
Expand Down