-
Notifications
You must be signed in to change notification settings - Fork 25.5k
Update write load monitor log-levels #136137
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from 1 commit
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change | ||||
---|---|---|---|---|---|---|
|
@@ -21,6 +21,7 @@ | |||||
import org.elasticsearch.common.Strings; | ||||||
import org.elasticsearch.common.settings.ClusterSettings; | ||||||
import org.elasticsearch.common.util.set.Sets; | ||||||
import org.elasticsearch.core.TimeValue; | ||||||
import org.elasticsearch.gateway.GatewayService; | ||||||
import org.elasticsearch.threadpool.ThreadPool; | ||||||
|
||||||
|
@@ -62,12 +63,12 @@ public WriteLoadConstraintMonitor( | |||||
public void onNewInfo(ClusterInfo clusterInfo) { | ||||||
final ClusterState state = clusterStateSupplier.get(); | ||||||
if (state.blocks().hasGlobalBlock(GatewayService.STATE_NOT_RECOVERED_BLOCK)) { | ||||||
logger.debug("skipping monitor as the cluster state is not recovered yet"); | ||||||
logger.trace("skipping monitor as the cluster state is not recovered yet"); | ||||||
return; | ||||||
} | ||||||
|
||||||
if (writeLoadConstraintSettings.getWriteLoadConstraintEnabled().notFullyEnabled()) { | ||||||
logger.debug("skipping monitor because the write load decider is not fully enabled"); | ||||||
logger.trace("skipping monitor because the write load decider is not fully enabled"); | ||||||
return; | ||||||
} | ||||||
|
||||||
|
@@ -85,7 +86,7 @@ public void onNewInfo(ClusterInfo clusterInfo) { | |||||
}); | ||||||
|
||||||
if (nodeIdsExceedingLatencyThreshold.isEmpty()) { | ||||||
logger.debug("No hot-spotting nodes detected"); | ||||||
logger.trace("No hot-spotting nodes detected"); | ||||||
return; | ||||||
} | ||||||
|
||||||
|
@@ -94,12 +95,20 @@ public void onNewInfo(ClusterInfo clusterInfo) { | |||||
final boolean haveCalledRerouteRecently = timeSinceLastRerouteMillis < writeLoadConstraintSettings.getMinimumRerouteInterval() | ||||||
.millis(); | ||||||
|
||||||
// We know that there is at least one hot-spotting node if we've reached this code. Now check whether there are any new hot-spots | ||||||
// or hot-spots that are persisting and need further balancing work. | ||||||
if (haveCalledRerouteRecently == false | ||||||
|| Sets.difference(nodeIdsExceedingLatencyThreshold, lastSetOfHotSpottedNodes).isEmpty() == false) { | ||||||
if (logger.isDebugEnabled()) { | ||||||
logger.debug( | ||||||
"Found {} exceeding the write thread pool queue latency threshold ({} total), triggering reroute", | ||||||
""" | ||||||
Nodes [{}] are hot-spotting, of {} total cluster nodes. Reroute for hot-spotting was last called [{}] ago. \ | ||||||
Previously hot-spotting nodes are [{}]. The write thread pool queue latency threshold is [{}]. Triggering reroute. | ||||||
""", | ||||||
nodeSummary(nodeIdsExceedingLatencyThreshold), | ||||||
state.nodes().size(), | ||||||
lastRerouteTimeMillis == 0 ? "never" : TimeValue.timeValueMillis(timeSinceLastRerouteMillis), | ||||||
nodeSummary(lastSetOfHotSpottedNodes), | ||||||
state.nodes().size() | ||||||
); | ||||||
} | ||||||
|
@@ -115,7 +124,10 @@ public void onNewInfo(ClusterInfo clusterInfo) { | |||||
lastRerouteTimeMillis = currentTimeMillisSupplier.getAsLong(); | ||||||
lastSetOfHotSpottedNodes = nodeIdsExceedingLatencyThreshold; | ||||||
} else { | ||||||
logger.debug("Not calling reroute because we called reroute recently and there are no new hot spots"); | ||||||
logger.debug( | ||||||
"Not calling reroute because we called reroute [{}] ago and there are no new hot spots", | ||||||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more.
Suggested change
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. I believe this is intentional — I tried to make that a bit clearer with the new comment before the if-else statement, but it seems still tricky. There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Err, you are right. |
||||||
TimeValue.timeValueMillis(timeSinceLastRerouteMillis) | ||||||
); | ||||||
} | ||||||
} | ||||||
|
||||||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
With `never`, the log message will be `was last called [never] ago`,
which reads rather odd. I suggest we either just use `TimeValue.timeValueMillis(timeSinceLastRerouteMillis)`,
similar to how that is used in the `else` branch, or adjust the logging message
further so it reads something like `was never called`.
Also, it would be great to handle it similarly here and in the `else` branch.

There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Agreed, it doesn't flow. I figured that it was clear, but I can make it better 👍
The `else` branch has the advantage of never encountering `never`,
because that is logically impossible there (`haveCalledRerouteRecently` is always true in that branch).
That doesn't apply in this case.