-
Notifications
You must be signed in to change notification settings - Fork 25.6k
Use the last good NodeUsageStatsForThreadPools when a node returns an error #133896
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from 7 commits
640f0da
8c087f4
94d46a0
d072039
1c6e239
8b374dd
c3d3d3c
20fa020
5fdf7d1
6310e24
355f6d9
3decebd
36ab0fa
ff795ac
d5b17ab
ee9acc5
f8b9b4d
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,145 @@ | ||
| /* | ||
| * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one | ||
| * or more contributor license agreements. Licensed under the "Elastic License | ||
| * 2.0", the "GNU Affero General Public License v3.0 only", and the "Server Side | ||
| * Public License v 1"; you may not use this file except in compliance with, at | ||
| * your election, the "Elastic License 2.0", the "GNU Affero General Public | ||
| * License v3.0 only", or the "Server Side Public License, v 1". | ||
| */ | ||
|
|
||
| package org.elasticsearch.cluster; | ||
|
|
||
| import org.elasticsearch.action.admin.cluster.node.usage.NodeUsageStatsForThreadPoolsAction; | ||
| import org.elasticsearch.action.admin.cluster.node.usage.TransportNodeUsageStatsForThreadPoolsAction; | ||
| import org.elasticsearch.cluster.routing.allocation.WriteLoadConstraintSettings; | ||
| import org.elasticsearch.cluster.service.ClusterService; | ||
| import org.elasticsearch.common.settings.Settings; | ||
| import org.elasticsearch.common.util.CollectionUtils; | ||
| import org.elasticsearch.common.util.Maps; | ||
| import org.elasticsearch.core.Nullable; | ||
| import org.elasticsearch.plugins.Plugin; | ||
| import org.elasticsearch.test.ESIntegTestCase; | ||
| import org.elasticsearch.test.transport.MockTransportService; | ||
| import org.elasticsearch.threadpool.ThreadPool; | ||
| import org.elasticsearch.transport.TestTransportChannel; | ||
|
|
||
| import java.time.Instant; | ||
| import java.util.Collection; | ||
| import java.util.Objects; | ||
|
|
||
| import static org.hamcrest.Matchers.equalTo; | ||
| import static org.hamcrest.Matchers.hasKey; | ||
|
|
||
| @ESIntegTestCase.ClusterScope(scope = ESIntegTestCase.Scope.TEST) | ||
| public class NodeUsageStatsForThreadPoolsCollectorIT extends ESIntegTestCase { | ||
|
|
||
| @Override | ||
| protected Settings nodeSettings(int nodeOrdinal, Settings otherSettings) { | ||
| return Settings.builder() | ||
| .put(super.nodeSettings(nodeOrdinal, otherSettings)) | ||
| // Need to enable write load decider to enable node usage stats collection | ||
| .put( | ||
| WriteLoadConstraintSettings.WRITE_LOAD_DECIDER_ENABLED_SETTING.getKey(), | ||
| WriteLoadConstraintSettings.WriteLoadDeciderStatus.ENABLED | ||
| ) | ||
| .build(); | ||
| } | ||
|
|
||
| @Override | ||
| protected Collection<Class<? extends Plugin>> nodePlugins() { | ||
| return CollectionUtils.appendToCopy(super.nodePlugins(), MockTransportService.TestPlugin.class); | ||
| } | ||
|
|
||
| public void testMostRecentValueIsUsedWhenNodeRequestFails() { | ||
| final var dataNodeName = internalCluster().startDataOnlyNode(); | ||
| final var dataNodeClusterService = internalCluster().getInstance(ClusterService.class, dataNodeName); | ||
| final var dataNodeTransportService = MockTransportService.getInstance(dataNodeName); | ||
| final var threadPoolName = randomFrom(ThreadPool.Names.GENERIC, ThreadPool.Names.WRITE, ThreadPool.Names.SEARCH); | ||
|
|
||
| // Intercept the node request and return some fake values | ||
| final int totalThreadPoolThreads = randomIntBetween(2, 40); | ||
| final float averageThreadPoolUtilization = randomFloatBetween(0.0f, 1.0f, true); | ||
| final long maxThreadPoolQueueLatencyMillis = randomLongBetween(0, 1000); | ||
| dataNodeTransportService.addRequestHandlingBehavior( | ||
| TransportNodeUsageStatsForThreadPoolsAction.NAME + "[n]", | ||
| (handler, request, channel, task) -> { | ||
| NodeUsageStatsForThreadPoolsAction.NodeResponse response = safeAwait( | ||
| l -> handler.messageReceived( | ||
| request, | ||
| new TestTransportChannel(l.map(res -> (NodeUsageStatsForThreadPoolsAction.NodeResponse) res)), | ||
| task | ||
| ) | ||
| ); | ||
| final var responseStats = response.getNodeUsageStatsForThreadPools(); | ||
| channel.sendResponse( | ||
| new NodeUsageStatsForThreadPoolsAction.NodeResponse( | ||
| response.getNode(), | ||
| new NodeUsageStatsForThreadPools( | ||
| responseStats.nodeId(), | ||
| Maps.copyMapWithAddedOrReplacedEntry( | ||
| responseStats.threadPoolUsageStatsMap(), | ||
| threadPoolName, | ||
| new NodeUsageStatsForThreadPools.ThreadPoolUsageStats( | ||
| totalThreadPoolThreads, | ||
| averageThreadPoolUtilization, | ||
| maxThreadPoolQueueLatencyMillis | ||
| ) | ||
| ), | ||
| Instant.now() | ||
| ) | ||
| ) | ||
| ); | ||
| } | ||
| ); | ||
|
|
||
| // This info should contain our fake values | ||
| final var successfulStats = assertThreadPoolHasStats( | ||
| dataNodeClusterService.localNode().getId(), | ||
| threadPoolName, | ||
| totalThreadPoolThreads, | ||
| averageThreadPoolUtilization, | ||
| maxThreadPoolQueueLatencyMillis, | ||
| null | ||
| ); | ||
|
|
||
| // Now simulate an error | ||
| dataNodeTransportService.addRequestHandlingBehavior( | ||
| TransportNodeUsageStatsForThreadPoolsAction.NAME + "[n]", | ||
| (handler, request, channel, task) -> { | ||
| channel.sendResponse(new Exception("simulated error")); | ||
| } | ||
| ); | ||
|
|
||
| // The next response should also contain our fake values | ||
| assertThreadPoolHasStats( | ||
| dataNodeClusterService.localNode().getId(), | ||
| threadPoolName, | ||
| totalThreadPoolThreads, | ||
| averageThreadPoolUtilization, | ||
| maxThreadPoolQueueLatencyMillis, | ||
| successfulStats.timestamp() | ||
| ); | ||
| } | ||
|
|
||
| private NodeUsageStatsForThreadPools assertThreadPoolHasStats( | ||
DiannaHohensee marked this conversation as resolved.
Outdated
Show resolved
Hide resolved
|
||
| String nodeId, | ||
| String threadPoolName, | ||
| int totalThreadPoolThreads, | ||
| float averageThreadPoolUtilization, | ||
| long maxThreadPoolQueueLatencyMillis, | ||
| @Nullable Instant timestamp | ||
| ) { | ||
| final var clusterInfo = Objects.requireNonNull(refreshClusterInfo()); | ||
| final var nodeUsageStatsForThreadPools = clusterInfo.getNodeUsageStatsForThreadPools().get(nodeId); | ||
| if (timestamp != null) { | ||
| assertThat(nodeUsageStatsForThreadPools.timestamp(), equalTo(timestamp)); | ||
| } | ||
| final var usageStatsMap = nodeUsageStatsForThreadPools.threadPoolUsageStatsMap(); | ||
| assertThat(usageStatsMap, hasKey(threadPoolName)); | ||
| final var threadPoolStats = usageStatsMap.get(threadPoolName); | ||
| assertThat(threadPoolStats.totalThreadPoolThreads(), equalTo(totalThreadPoolThreads)); | ||
| assertThat(threadPoolStats.averageThreadPoolUtilization(), equalTo(averageThreadPoolUtilization)); | ||
| assertThat(threadPoolStats.maxThreadPoolQueueLatencyMillis(), equalTo(maxThreadPoolQueueLatencyMillis)); | ||
| return nodeUsageStatsForThreadPools; | ||
| } | ||
| } | ||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -27,8 +27,8 @@ | |
| import org.elasticsearch.transport.TransportService; | ||
|
|
||
| import java.io.IOException; | ||
| import java.time.Instant; | ||
| import java.util.Collection; | ||
| import java.util.HashMap; | ||
| import java.util.List; | ||
| import java.util.Map; | ||
| import java.util.Set; | ||
|
|
@@ -117,11 +117,9 @@ protected NodeUsageStatsForThreadPoolsAction.NodeResponse nodeOperation( | |
| maxQueueLatencyMillis | ||
| ); | ||
|
|
||
| Map<String, ThreadPoolUsageStats> perThreadPool = new HashMap<>(); | ||
| perThreadPool.put(ThreadPool.Names.WRITE, threadPoolUsageStats); | ||
| return new NodeUsageStatsForThreadPoolsAction.NodeResponse( | ||
| localNode, | ||
| new NodeUsageStatsForThreadPools(localNode.getId(), perThreadPool) | ||
| new NodeUsageStatsForThreadPools(localNode.getId(), Map.of(ThreadPool.Names.WRITE, threadPoolUsageStats), Instant.now()) | ||
|
||
| ); | ||
| } | ||
|
|
||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -9,13 +9,14 @@ | |
|
|
||
| package org.elasticsearch.cluster; | ||
|
|
||
| import org.elasticsearch.TransportVersions; | ||
| import org.elasticsearch.common.io.stream.StreamInput; | ||
| import org.elasticsearch.common.io.stream.StreamOutput; | ||
| import org.elasticsearch.common.io.stream.Writeable; | ||
|
|
||
| import java.io.IOException; | ||
| import java.time.Instant; | ||
| import java.util.Map; | ||
| import java.util.Objects; | ||
|
|
||
| /** | ||
| * Record of a node's thread pool usage stats (operation load). Maps thread pool stats by thread pool name. | ||
|
|
@@ -24,48 +25,29 @@ | |
| * @param threadPoolUsageStatsMap A map of thread pool name ({@link org.elasticsearch.threadpool.ThreadPool.Names}) to the thread pool's | ||
| * usage stats ({@link ThreadPoolUsageStats}). | ||
| */ | ||
| public record NodeUsageStatsForThreadPools(String nodeId, Map<String, ThreadPoolUsageStats> threadPoolUsageStatsMap) implements Writeable { | ||
| public record NodeUsageStatsForThreadPools(String nodeId, Map<String, ThreadPoolUsageStats> threadPoolUsageStatsMap, Instant timestamp) | ||
| implements | ||
| Writeable { | ||
|
|
||
| public NodeUsageStatsForThreadPools(StreamInput in) throws IOException { | ||
| this(in.readString(), in.readImmutableMap(ThreadPoolUsageStats::new)); | ||
| public static NodeUsageStatsForThreadPools readFrom(StreamInput in) throws IOException { | ||
| final var nodeId = in.readString(); | ||
| final var threadPoolUsageStatsMap = in.readImmutableMap(ThreadPoolUsageStats::new); | ||
| final Instant receivedTime; | ||
| if (in.getTransportVersion().onOrAfter(TransportVersions.TIMESTAMP_IN_NODE_USAGE_STATS_FOR_THREAD_POOLS)) { | ||
| receivedTime = in.readInstant(); | ||
| } else { | ||
| receivedTime = Instant.now(); | ||
| } | ||
| return new NodeUsageStatsForThreadPools(nodeId, threadPoolUsageStatsMap, receivedTime); | ||
| } | ||
|
|
||
| @Override | ||
| public void writeTo(StreamOutput out) throws IOException { | ||
| out.writeString(this.nodeId); | ||
| out.writeMap(this.threadPoolUsageStatsMap, StreamOutput::writeWriteable); | ||
| } | ||
|
|
||
| @Override | ||
| public int hashCode() { | ||
| return Objects.hash(nodeId, threadPoolUsageStatsMap); | ||
| } | ||
|
|
||
| @Override | ||
| public boolean equals(Object o) { | ||
| if (this == o) return true; | ||
| if (o == null || getClass() != o.getClass()) return false; | ||
| NodeUsageStatsForThreadPools other = (NodeUsageStatsForThreadPools) o; | ||
| for (var entry : other.threadPoolUsageStatsMap.entrySet()) { | ||
| if (nodeId.equals(other.nodeId) == false) { | ||
| return false; | ||
| } | ||
| var loadStats = threadPoolUsageStatsMap.get(entry.getKey()); | ||
| if (loadStats == null || loadStats.equals(entry.getValue()) == false) { | ||
| return false; | ||
| } | ||
| } | ||
| return true; | ||
| } | ||
|
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Redundant because it's a record. |
||
|
|
||
| @Override | ||
| public String toString() { | ||
| StringBuilder builder = new StringBuilder(getClass().getSimpleName() + "{nodeId=" + nodeId + ", threadPoolUsageStatsMap=["); | ||
| for (var entry : threadPoolUsageStatsMap.entrySet()) { | ||
| builder.append("{ThreadPool.Names=" + entry.getKey() + ", ThreadPoolUsageStats=" + entry.getValue() + "}"); | ||
| if (out.getTransportVersion().onOrAfter(TransportVersions.TIMESTAMP_IN_NODE_USAGE_STATS_FOR_THREAD_POOLS)) { | ||
| out.writeInstant(this.timestamp); | ||
| } | ||
| builder.append("]}"); | ||
| return builder.toString(); | ||
| } | ||
|
|
||
| /** | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -11,16 +11,22 @@ | |
|
|
||
| import org.elasticsearch.TransportVersion; | ||
| import org.elasticsearch.action.ActionListener; | ||
| import org.elasticsearch.action.FailedNodeException; | ||
| import org.elasticsearch.action.admin.cluster.node.usage.NodeUsageStatsForThreadPoolsAction; | ||
| import org.elasticsearch.action.admin.cluster.node.usage.TransportNodeUsageStatsForThreadPoolsAction; | ||
| import org.elasticsearch.client.internal.Client; | ||
| import org.elasticsearch.cluster.node.DiscoveryNode; | ||
|
|
||
| import java.util.Arrays; | ||
| import java.util.HashMap; | ||
| import java.util.Map; | ||
| import java.util.concurrent.ConcurrentHashMap; | ||
|
|
||
| /** | ||
| * Collects the thread pool usage stats for each node in the cluster. | ||
| * <p> | ||
| * Results are returned as a map of node ID to node usage stats. | ||
| * Results are returned as a map of node ID to node usage stats. Keeps track of the most recent | ||
| * usage stats for each node, which will be returned in the event of a failure response from that node. | ||
| */ | ||
| public class NodeUsageStatsForThreadPoolsCollector { | ||
| public static final NodeUsageStatsForThreadPoolsCollector EMPTY = new NodeUsageStatsForThreadPoolsCollector() { | ||
|
|
@@ -37,6 +43,8 @@ public void collectUsageStats( | |
| "transport_node_usage_stats_for_thread_pools_action" | ||
| ); | ||
|
|
||
| private final Map<String, NodeUsageStatsForThreadPools> lastNodeUsageStatsPerNode = new ConcurrentHashMap<>(); | ||
DiannaHohensee marked this conversation as resolved.
Show resolved
Hide resolved
|
||
|
|
||
| /** | ||
| * Collects the thread pool usage stats ({@link NodeUsageStatsForThreadPools}) for each node in the cluster. | ||
| * | ||
|
|
@@ -47,15 +55,39 @@ public void collectUsageStats( | |
| ClusterState clusterState, | ||
| ActionListener<Map<String, NodeUsageStatsForThreadPools>> listener | ||
| ) { | ||
| var dataNodeIds = clusterState.nodes().getDataNodes().values().stream().map(node -> node.getId()).toArray(String[]::new); | ||
| var dataNodeIds = clusterState.nodes().getDataNodes().values().stream().map(DiscoveryNode::getId).toArray(String[]::new); | ||
| // Discard last-seen values for any nodes no longer present in the cluster state | ||
| lastNodeUsageStatsPerNode.keySet().retainAll(Arrays.asList(dataNodeIds)); | ||
| if (clusterState.getMinTransportVersion().supports(TRANSPORT_NODE_USAGE_STATS_FOR_THREAD_POOLS_ACTION)) { | ||
| client.execute( | ||
| TransportNodeUsageStatsForThreadPoolsAction.TYPE, | ||
| new NodeUsageStatsForThreadPoolsAction.Request(dataNodeIds), | ||
| listener.map(response -> response.getAllNodeUsageStatsForThreadPools()) | ||
| listener.map(this::replaceFailuresWithLastSeenValues) | ||
| ); | ||
| } else { | ||
| listener.onResponse(Map.of()); | ||
| } | ||
| } | ||
|
|
||
| /** | ||
| * Turns a nodes response into a map of node ID to usage stats, standing in the last-seen stats for nodes that failed. | ||
| * <p> | ||
| * Every stats entry in the response is first recorded as the last-seen value for its node. When the response reports | ||
| * no failures its stats map is returned directly; otherwise a copy is returned that also holds the last-seen stats of | ||
| * each failed node for which a value has been recorded. | ||
| * | ||
| * @param response the response to the node usage stats request | ||
| * @return usage stats keyed by node ID | ||
| */ | ||
| private Map<String, NodeUsageStatsForThreadPools> replaceFailuresWithLastSeenValues( | ||
| NodeUsageStatsForThreadPoolsAction.Response response | ||
| ) { | ||
| final Map<String, NodeUsageStatsForThreadPools> freshStats = response.getAllNodeUsageStatsForThreadPools(); | ||
| // Remember what each responding node reported so it can stand in for a later failure | ||
| lastNodeUsageStatsPerNode.putAll(freshStats); | ||
|
|
||
| if (response.hasFailures()) { | ||
| // Start from the fresh values, then fill in the remembered values of the nodes that failed | ||
| final Map<String, NodeUsageStatsForThreadPools> merged = new HashMap<>(freshStats); | ||
| for (FailedNodeException failure : response.failures()) { | ||
| final String failedNodeId = failure.nodeId(); | ||
| final NodeUsageStatsForThreadPools lastSeen = lastNodeUsageStatsPerNode.get(failedNodeId); | ||
| if (lastSeen == null) { | ||
| // Nothing has been recorded for this node, so it stays absent from the result | ||
| continue; | ||
| } | ||
| merged.put(failedNodeId, lastSeen); | ||
| } | ||
| return merged; | ||
| } | ||
| return freshStats; | ||
| } | ||
| } | ||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Is this annotation necessary?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
No, fixed in d5b17ab