Skip to content

Commit d7fda61

Browse files
WriteLoadConstraintMonitor will not call reroute if no nodes are below threshold (elastic#136925)
The write load monitor will no longer attempt to address hot-spots with a reroute request if there are no nodes below the queue latency threshold to receive relocated shards. It also excludes search-role nodes when considering both hot-spotting nodes and relocation target nodes. Closes ES-13237
1 parent 9ad14c5 commit d7fda61

File tree

4 files changed

+204
-60
lines changed

4 files changed

+204
-60
lines changed

server/src/internalClusterTest/java/org/elasticsearch/cluster/allocation/WriteLoadConstraintMonitorIT.java

Lines changed: 6 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -51,6 +51,8 @@ public void testRerouteIsCalledWhenHotSpotAppears() {
5151
internalCluster().startMasterOnlyNode(settings);
5252
final String dataNodeOne = internalCluster().startDataOnlyNode(settings);
5353
final String dataNodeTwo = internalCluster().startDataOnlyNode(settings);
54+
// Maintain a third node so that there's always at least one non-hot-spotting node that can receive shards.
55+
internalCluster().startDataOnlyNode(settings);
5456

5557
// Unmodified cluster info should detect no hot-spotting nodes
5658
MockLog.awaitLogger(
@@ -60,7 +62,7 @@ public void testRerouteIsCalledWhenHotSpotAppears() {
6062
"no hot-spots detected",
6163
WriteLoadConstraintMonitor.class.getCanonicalName(),
6264
Level.TRACE,
63-
"No hot-spotting nodes detected"
65+
"No hot-spotting write nodes detected"
6466
)
6567
);
6668

@@ -76,7 +78,7 @@ public void testRerouteIsCalledWhenHotSpotAppears() {
7678
WriteLoadConstraintMonitor.class.getCanonicalName(),
7779
Level.DEBUG,
7880
Strings.format("""
79-
Nodes [[%s]] are hot-spotting, of 3 total cluster nodes. Reroute for hot-spotting has never previously been called. \
81+
Nodes [[%s]] are hot-spotting, of 4 total cluster nodes. Reroute for hot-spotting has never previously been called. \
8082
Previously hot-spotting nodes are [0 nodes]. The write thread pool queue latency threshold is [%s]. \
8183
Triggering reroute.
8284
""", getNodeId(dataNodeOne), TimeValue.timeValueMillis(queueLatencyThresholdMillis))
@@ -111,7 +113,7 @@ public void testRerouteIsCalledWhenHotSpotAppears() {
111113
WriteLoadConstraintMonitor.class.getCanonicalName(),
112114
Level.DEBUG,
113115
Strings.format("""
114-
Nodes [[*]] are hot-spotting, of 3 total cluster nodes. \
116+
Nodes [[*]] are hot-spotting, of 4 total cluster nodes. \
115117
Reroute for hot-spotting was last called [*] ago. Previously hot-spotting nodes are [[%s]]. \
116118
The write thread pool queue latency threshold is [%s]. Triggering reroute.
117119
""", getNodeId(dataNodeOne), TimeValue.timeValueMillis(queueLatencyThresholdMillis))
@@ -130,7 +132,7 @@ public void testRerouteIsCalledWhenHotSpotAppears() {
130132
"no hot-spots detected",
131133
WriteLoadConstraintMonitor.class.getCanonicalName(),
132134
Level.TRACE,
133-
"No hot-spotting nodes detected"
135+
"No hot-spotting write nodes detected"
134136
)
135137
);
136138
}

server/src/main/java/org/elasticsearch/cluster/routing/allocation/WriteLoadConstraintMonitor.java

Lines changed: 24 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,7 @@
1616
import org.elasticsearch.cluster.ClusterInfoService;
1717
import org.elasticsearch.cluster.ClusterState;
1818
import org.elasticsearch.cluster.NodeUsageStatsForThreadPools;
19+
import org.elasticsearch.cluster.node.DiscoveryNodeRole;
1920
import org.elasticsearch.cluster.routing.RerouteService;
2021
import org.elasticsearch.common.Priority;
2122
import org.elasticsearch.common.Strings;
@@ -26,6 +27,7 @@
2627
import org.elasticsearch.threadpool.ThreadPool;
2728

2829
import java.util.Set;
30+
import java.util.concurrent.atomic.AtomicBoolean;
2931
import java.util.function.LongSupplier;
3032
import java.util.function.Supplier;
3133

@@ -75,18 +77,32 @@ public void onNewInfo(ClusterInfo clusterInfo) {
7577
logger.trace("processing new cluster info");
7678

7779
final int numberOfNodes = clusterInfo.getNodeUsageStatsForThreadPools().size();
78-
final Set<String> nodeIdsExceedingLatencyThreshold = Sets.newHashSetWithExpectedSize(numberOfNodes);
80+
final Set<String> writeNodeIdsExceedingQueueLatencyThreshold = Sets.newHashSetWithExpectedSize(numberOfNodes);
81+
AtomicBoolean haveWriteNodesBelowQueueLatencyThreshold = new AtomicBoolean(false);
7982
clusterInfo.getNodeUsageStatsForThreadPools().forEach((nodeId, usageStats) -> {
83+
if (state.getNodes().get(nodeId).getRoles().contains(DiscoveryNodeRole.SEARCH_ROLE)) {
84+
// Search nodes are not expected to have write load hot-spots and are not considered for shard relocation.
85+
// TODO (ES-13314): consider stateful data tiers
86+
return;
87+
}
8088
final NodeUsageStatsForThreadPools.ThreadPoolUsageStats writeThreadPoolStats = usageStats.threadPoolUsageStatsMap()
8189
.get(ThreadPool.Names.WRITE);
8290
assert writeThreadPoolStats != null : "Write thread pool is not publishing usage stats for node [" + nodeId + "]";
83-
if (writeThreadPoolStats.maxThreadPoolQueueLatencyMillis() > writeLoadConstraintSettings.getQueueLatencyThreshold().millis()) {
84-
nodeIdsExceedingLatencyThreshold.add(nodeId);
91+
if (writeThreadPoolStats.maxThreadPoolQueueLatencyMillis() >= writeLoadConstraintSettings.getQueueLatencyThreshold().millis()) {
92+
writeNodeIdsExceedingQueueLatencyThreshold.add(nodeId);
93+
} else {
94+
haveWriteNodesBelowQueueLatencyThreshold.set(true);
8595
}
8696
});
8797

88-
if (nodeIdsExceedingLatencyThreshold.isEmpty()) {
89-
logger.trace("No hot-spotting nodes detected");
98+
if (writeNodeIdsExceedingQueueLatencyThreshold.isEmpty()) {
99+
logger.trace("No hot-spotting write nodes detected");
100+
return;
101+
}
102+
if (haveWriteNodesBelowQueueLatencyThreshold.get() == false) {
103+
logger.debug("""
104+
Nodes [{}] are above the queue latency threshold, but there are no write nodes below the threshold. \
105+
Cannot rebalance shards.""", nodeSummary(writeNodeIdsExceedingQueueLatencyThreshold));
90106
return;
91107
}
92108

@@ -98,14 +114,14 @@ public void onNewInfo(ClusterInfo clusterInfo) {
98114
// We know that there is at least one hot-spotting node if we've reached this code. Now check whether there are any new hot-spots
99115
// or hot-spots that are persisting and need further balancing work.
100116
if (haveCalledRerouteRecently == false
101-
|| Sets.difference(nodeIdsExceedingLatencyThreshold, lastSetOfHotSpottedNodes).isEmpty() == false) {
117+
|| Sets.difference(writeNodeIdsExceedingQueueLatencyThreshold, lastSetOfHotSpottedNodes).isEmpty() == false) {
102118
if (logger.isDebugEnabled()) {
103119
logger.debug(
104120
"""
105121
Nodes [{}] are hot-spotting, of {} total cluster nodes. Reroute for hot-spotting {}. \
106122
Previously hot-spotting nodes are [{}]. The write thread pool queue latency threshold is [{}]. Triggering reroute.
107123
""",
108-
nodeSummary(nodeIdsExceedingLatencyThreshold),
124+
nodeSummary(writeNodeIdsExceedingQueueLatencyThreshold),
109125
state.nodes().size(),
110126
lastRerouteTimeMillis == 0
111127
? "has never previously been called"
@@ -124,7 +140,7 @@ public void onNewInfo(ClusterInfo clusterInfo) {
124140
)
125141
);
126142
lastRerouteTimeMillis = currentTimeMillisSupplier.getAsLong();
127-
lastSetOfHotSpottedNodes = nodeIdsExceedingLatencyThreshold;
143+
lastSetOfHotSpottedNodes = writeNodeIdsExceedingQueueLatencyThreshold;
128144
} else {
129145
logger.debug(
130146
"Not calling reroute because we called reroute [{}] ago and there are no new hot spots",

0 commit comments

Comments
 (0)