Merged
Changes from 7 commits
@@ -0,0 +1,145 @@
/*
* Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
* or more contributor license agreements. Licensed under the "Elastic License
* 2.0", the "GNU Affero General Public License v3.0 only", and the "Server Side
* Public License v 1"; you may not use this file except in compliance with, at
* your election, the "Elastic License 2.0", the "GNU Affero General Public
* License v3.0 only", or the "Server Side Public License, v 1".
*/

package org.elasticsearch.cluster;

import org.elasticsearch.action.admin.cluster.node.usage.NodeUsageStatsForThreadPoolsAction;
import org.elasticsearch.action.admin.cluster.node.usage.TransportNodeUsageStatsForThreadPoolsAction;
import org.elasticsearch.cluster.routing.allocation.WriteLoadConstraintSettings;
import org.elasticsearch.cluster.service.ClusterService;
import org.elasticsearch.common.settings.Settings;
import org.elasticsearch.common.util.CollectionUtils;
import org.elasticsearch.common.util.Maps;
import org.elasticsearch.core.Nullable;
import org.elasticsearch.plugins.Plugin;
import org.elasticsearch.test.ESIntegTestCase;
import org.elasticsearch.test.transport.MockTransportService;
import org.elasticsearch.threadpool.ThreadPool;
import org.elasticsearch.transport.TestTransportChannel;

import java.time.Instant;
import java.util.Collection;
import java.util.Objects;

import static org.hamcrest.Matchers.equalTo;
import static org.hamcrest.Matchers.hasKey;

@ESIntegTestCase.ClusterScope(scope = ESIntegTestCase.Scope.TEST)
Member
Is this annotation necessary?

Contributor Author
No, fixed in d5b17ab

public class NodeUsageStatsForThreadPoolsCollectorIT extends ESIntegTestCase {

@Override
protected Settings nodeSettings(int nodeOrdinal, Settings otherSettings) {
return Settings.builder()
.put(super.nodeSettings(nodeOrdinal, otherSettings))
// Need to enable write load decider to enable node usage stats collection
.put(
WriteLoadConstraintSettings.WRITE_LOAD_DECIDER_ENABLED_SETTING.getKey(),
WriteLoadConstraintSettings.WriteLoadDeciderStatus.ENABLED
)
.build();
}

@Override
protected Collection<Class<? extends Plugin>> nodePlugins() {
return CollectionUtils.appendToCopy(super.nodePlugins(), MockTransportService.TestPlugin.class);
}

public void testMostRecentValueIsUsedWhenNodeRequestFails() {
final var dataNodeName = internalCluster().startDataOnlyNode();
final var dataNodeClusterService = internalCluster().getInstance(ClusterService.class, dataNodeName);
final var dataNodeTransportService = MockTransportService.getInstance(dataNodeName);
final var threadPoolName = randomFrom(ThreadPool.Names.GENERIC, ThreadPool.Names.WRITE, ThreadPool.Names.SEARCH);

// Intercept the node request and return some fake values
final int totalThreadPoolThreads = randomIntBetween(2, 40);
final float averageThreadPoolUtilization = randomFloatBetween(0.0f, 1.0f, true);
final long maxThreadPoolQueueLatencyMillis = randomLongBetween(0, 1000);
dataNodeTransportService.addRequestHandlingBehavior(
TransportNodeUsageStatsForThreadPoolsAction.NAME + "[n]",
(handler, request, channel, task) -> {
NodeUsageStatsForThreadPoolsAction.NodeResponse response = safeAwait(
l -> handler.messageReceived(
request,
new TestTransportChannel(l.map(res -> (NodeUsageStatsForThreadPoolsAction.NodeResponse) res)),
task
)
);
final var responseStats = response.getNodeUsageStatsForThreadPools();
channel.sendResponse(
new NodeUsageStatsForThreadPoolsAction.NodeResponse(
response.getNode(),
new NodeUsageStatsForThreadPools(
responseStats.nodeId(),
Maps.copyMapWithAddedOrReplacedEntry(
responseStats.threadPoolUsageStatsMap(),
threadPoolName,
new NodeUsageStatsForThreadPools.ThreadPoolUsageStats(
totalThreadPoolThreads,
averageThreadPoolUtilization,
maxThreadPoolQueueLatencyMillis
)
),
Instant.now()
)
)
);
}
);

// This info should contain our fake values
final var successfulStats = assertThreadPoolHasStats(
dataNodeClusterService.localNode().getId(),
threadPoolName,
totalThreadPoolThreads,
averageThreadPoolUtilization,
maxThreadPoolQueueLatencyMillis,
null
);

// Now simulate an error
dataNodeTransportService.addRequestHandlingBehavior(
TransportNodeUsageStatsForThreadPoolsAction.NAME + "[n]",
(handler, request, channel, task) -> {
channel.sendResponse(new Exception("simulated error"));
}
);

// The next response should also contain our fake values
assertThreadPoolHasStats(
dataNodeClusterService.localNode().getId(),
threadPoolName,
totalThreadPoolThreads,
averageThreadPoolUtilization,
maxThreadPoolQueueLatencyMillis,
successfulStats.timestamp()
);
}

private NodeUsageStatsForThreadPools assertThreadPoolHasStats(
String nodeId,
String threadPoolName,
int totalThreadPoolThreads,
float averageThreadPoolUtilization,
long maxThreadPoolQueueLatencyMillis,
@Nullable Instant timestamp
) {
final var clusterInfo = Objects.requireNonNull(refreshClusterInfo());
final var nodeUsageStatsForThreadPools = clusterInfo.getNodeUsageStatsForThreadPools().get(nodeId);
if (timestamp != null) {
assertThat(nodeUsageStatsForThreadPools.timestamp(), equalTo(timestamp));
}
final var usageStatsMap = nodeUsageStatsForThreadPools.threadPoolUsageStatsMap();
assertThat(usageStatsMap, hasKey(threadPoolName));
final var threadPoolStats = usageStatsMap.get(threadPoolName);
assertThat(threadPoolStats.totalThreadPoolThreads(), equalTo(totalThreadPoolThreads));
assertThat(threadPoolStats.averageThreadPoolUtilization(), equalTo(averageThreadPoolUtilization));
assertThat(threadPoolStats.maxThreadPoolQueueLatencyMillis(), equalTo(maxThreadPoolQueueLatencyMillis));
return nodeUsageStatsForThreadPools;
}
}
@@ -43,6 +43,7 @@
import org.elasticsearch.transport.TransportService;

import java.nio.file.Path;
import java.time.Instant;
import java.util.ArrayList;
import java.util.Collection;
import java.util.HashMap;
@@ -375,7 +376,7 @@ private NodeUsageStatsForThreadPools createNodeUsageStatsForThreadPools(
)
);

return new NodeUsageStatsForThreadPools(discoveryNode.getId(), threadPoolUsageMap);
return new NodeUsageStatsForThreadPools(discoveryNode.getId(), threadPoolUsageMap, Instant.now());
}

/**
@@ -348,6 +348,7 @@ static TransportVersion def(int id) {
public static final TransportVersion INFERENCE_API_DISABLE_EIS_RATE_LIMITING = def(9_152_0_00);
public static final TransportVersion GEMINI_THINKING_BUDGET_ADDED = def(9_153_0_00);
public static final TransportVersion VISIT_PERCENTAGE = def(9_154_0_00);
public static final TransportVersion TIMESTAMP_IN_NODE_USAGE_STATS_FOR_THREAD_POOLS = def(9_155_0_00);

/*
* STOP! READ THIS FIRST! No, really,
@@ -108,7 +108,7 @@ public static class NodeResponse extends BaseNodeResponse {

protected NodeResponse(StreamInput in, DiscoveryNode node) throws IOException {
super(in, node);
this.nodeUsageStatsForThreadPools = new NodeUsageStatsForThreadPools(in);
this.nodeUsageStatsForThreadPools = NodeUsageStatsForThreadPools.readFrom(in);
}

public NodeResponse(DiscoveryNode node, NodeUsageStatsForThreadPools nodeUsageStatsForThreadPools) {
@@ -118,7 +118,7 @@ public NodeResponse(DiscoveryNode node, NodeUsageStatsForThreadPools nodeUsageSt

public NodeResponse(StreamInput in) throws IOException {
super(in);
this.nodeUsageStatsForThreadPools = new NodeUsageStatsForThreadPools(in);
this.nodeUsageStatsForThreadPools = NodeUsageStatsForThreadPools.readFrom(in);
}

public NodeUsageStatsForThreadPools getNodeUsageStatsForThreadPools() {
@@ -27,8 +27,8 @@
import org.elasticsearch.transport.TransportService;

import java.io.IOException;
import java.time.Instant;
import java.util.Collection;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Set;
@@ -117,11 +117,9 @@ protected NodeUsageStatsForThreadPoolsAction.NodeResponse nodeOperation(
maxQueueLatencyMillis
);

Map<String, ThreadPoolUsageStats> perThreadPool = new HashMap<>();
perThreadPool.put(ThreadPool.Names.WRITE, threadPoolUsageStats);
return new NodeUsageStatsForThreadPoolsAction.NodeResponse(
localNode,
new NodeUsageStatsForThreadPools(localNode.getId(), perThreadPool)
new NodeUsageStatsForThreadPools(localNode.getId(), Map.of(ThreadPool.Names.WRITE, threadPoolUsageStats), Instant.now())
Contributor Author
This timestamp being added on the source node could be problematic if there were clock skew in the cluster. I wonder if it should be recorded on the client side instead.

Or should any "don't trust this if it's older than" threshold be of a magnitude where we don't need to worry about clock skew?

Contributor
I'm not enthusiastic about the timestamp here. There's no use for the timestamp in the code and it's not obviously logged anyplace. It's invasive to add and seems to be trying to solve a problem we don't have.

Logging a WARN message whenever we fail to get fresh stats from a node would be sufficient to convey the time when that happens -- very rarely -- and that an issue occurred. It's reasonable to log a WARN message because the cluster is going to be in distress if there are repeated failures to fetch stats from a single or multiple nodes.
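
A minimal sketch of what that WARN could look like in the collector, assuming a log4j Logger field on NodeUsageStatsForThreadPoolsCollector and the response.failures() accessor used later in this PR; the helper name and message wording are illustrative only, not part of the change:

private static final Logger logger = LogManager.getLogger(NodeUsageStatsForThreadPoolsCollector.class);

// Hypothetical helper: called once per collection round when the response carries failures.
private void logFailedNodes(NodeUsageStatsForThreadPoolsAction.Response response) {
    for (FailedNodeException failure : response.failures()) {
        // Rare in a healthy cluster, so WARN should not be noisy; includes the cause per the suggestion above.
        logger.warn("failed to collect thread pool usage stats from node [" + failure.nodeId() + "]", failure);
    }
}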

Contributor Author
@nicktindall Sep 9, 2025
Yeah, fair, it also requires the addition of a transport version. @mhl-b wdyt? I can easily revert it. I think I'm with @DiannaHohensee, it's something we can add when we need it.

Contributor
I'm ok with that. I still think unbounded staleness is useless, even harmful. Utilization from 5 minutes ago has no meaning. Maybe we should allow only one missing measurement (without needing a timestamp), but if we miss twice we don't report anything.

Contributor
"it's something we can add when we need it"

I don't think we can tell for sure once we blend together fresh and stale metrics. It would be some lagging node that starts to impact allocation decisions.

Contributor
So you would track a count of misses then?

Contributor Author
No, I mean use the timestamp. Once it's over a certain age we log a warning (I think we do something similar for autoscaling metrics)

Contributor Author
Then, if we see it happening a lot or implicated in issues, we can decide what to do about it.

Contributor
Does that mean you will keep the current version with the Instant, but rather than on the source node, use client-side time tracking?

Contributor Author
It probably makes more sense to track it on the client (in the collector); that way we can probably avoid transport version changes and don't have to worry about clock skew.
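
A rough sketch of what client-side tracking could look like in the collector, recording the receive time locally instead of serializing an Instant; the CachedStats record and onResponse helper are hypothetical names, not code from this PR:

// Hypothetical: pair each node's stats with the time the collector received them,
// avoiding both the transport version bump and any dependence on data node clocks.
private record CachedStats(NodeUsageStatsForThreadPools stats, Instant receivedAt) {}

private final Map<String, CachedStats> lastSeen = new ConcurrentHashMap<>();

private void onResponse(Map<String, NodeUsageStatsForThreadPools> statsByNodeId) {
    final Instant now = Instant.now(); // collector-local clock, so no cross-node skew
    statsByNodeId.forEach((nodeId, stats) -> lastSeen.put(nodeId, new CachedStats(stats, now)));
}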

);
}

@@ -110,7 +110,7 @@ public ClusterInfo(StreamInput in) throws IOException {
this.estimatedHeapUsages = Map.of();
}
if (in.getTransportVersion().onOrAfter(TransportVersions.NODE_USAGE_STATS_FOR_THREAD_POOLS_IN_CLUSTER_INFO)) {
this.nodeUsageStatsForThreadPools = in.readImmutableMap(NodeUsageStatsForThreadPools::new);
this.nodeUsageStatsForThreadPools = in.readImmutableMap(NodeUsageStatsForThreadPools::readFrom);
} else {
this.nodeUsageStatsForThreadPools = Map.of();
}
@@ -9,13 +9,14 @@

package org.elasticsearch.cluster;

import org.elasticsearch.TransportVersions;
import org.elasticsearch.common.io.stream.StreamInput;
import org.elasticsearch.common.io.stream.StreamOutput;
import org.elasticsearch.common.io.stream.Writeable;

import java.io.IOException;
import java.time.Instant;
import java.util.Map;
import java.util.Objects;

/**
* Record of a node's thread pool usage stats (operation load). Maps thread pool stats by thread pool name.
@@ -24,48 +25,29 @@
* @param threadPoolUsageStatsMap A map of thread pool name ({@link org.elasticsearch.threadpool.ThreadPool.Names}) to the thread pool's
* usage stats ({@link ThreadPoolUsageStats}).
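* @param timestamp The time at which the usage stats were captured (set via Instant.now() when the node builds its response).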
*/
public record NodeUsageStatsForThreadPools(String nodeId, Map<String, ThreadPoolUsageStats> threadPoolUsageStatsMap) implements Writeable {
public record NodeUsageStatsForThreadPools(String nodeId, Map<String, ThreadPoolUsageStats> threadPoolUsageStatsMap, Instant timestamp)
implements
Writeable {

public NodeUsageStatsForThreadPools(StreamInput in) throws IOException {
this(in.readString(), in.readImmutableMap(ThreadPoolUsageStats::new));
public static NodeUsageStatsForThreadPools readFrom(StreamInput in) throws IOException {
final var nodeId = in.readString();
final var threadPoolUsageStatsMap = in.readImmutableMap(ThreadPoolUsageStats::new);
final Instant receivedTime;
if (in.getTransportVersion().onOrAfter(TransportVersions.TIMESTAMP_IN_NODE_USAGE_STATS_FOR_THREAD_POOLS)) {
receivedTime = in.readInstant();
} else {
receivedTime = Instant.now();
}
return new NodeUsageStatsForThreadPools(nodeId, threadPoolUsageStatsMap, receivedTime);
}

@Override
public void writeTo(StreamOutput out) throws IOException {
out.writeString(this.nodeId);
out.writeMap(this.threadPoolUsageStatsMap, StreamOutput::writeWriteable);
}

@Override
public int hashCode() {
return Objects.hash(nodeId, threadPoolUsageStatsMap);
}

@Override
public boolean equals(Object o) {
if (this == o) return true;
if (o == null || getClass() != o.getClass()) return false;
NodeUsageStatsForThreadPools other = (NodeUsageStatsForThreadPools) o;
for (var entry : other.threadPoolUsageStatsMap.entrySet()) {
if (nodeId.equals(other.nodeId) == false) {
return false;
}
var loadStats = threadPoolUsageStatsMap.get(entry.getKey());
if (loadStats == null || loadStats.equals(entry.getValue()) == false) {
return false;
}
}
return true;
}
Contributor Author
Redundant because it's a record
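
For reference, a record's generated equals/hashCode/toString are component-wise, which is why the hand-written members above can go; a tiny illustration, not project code:

record Point(int x, int y) {}

// new Point(1, 2).equals(new Point(1, 2))  -> true (component-wise)
// new Point(1, 2).hashCode()               == new Point(1, 2).hashCode()
// new Point(1, 2).toString()               -> "Point[x=1, y=2]"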


@Override
public String toString() {
StringBuilder builder = new StringBuilder(getClass().getSimpleName() + "{nodeId=" + nodeId + ", threadPoolUsageStatsMap=[");
for (var entry : threadPoolUsageStatsMap.entrySet()) {
builder.append("{ThreadPool.Names=" + entry.getKey() + ", ThreadPoolUsageStats=" + entry.getValue() + "}");
if (out.getTransportVersion().onOrAfter(TransportVersions.TIMESTAMP_IN_NODE_USAGE_STATS_FOR_THREAD_POOLS)) {
out.writeInstant(this.timestamp);
}
builder.append("]}");
return builder.toString();
}

/**
@@ -11,16 +11,22 @@

import org.elasticsearch.TransportVersion;
import org.elasticsearch.action.ActionListener;
import org.elasticsearch.action.FailedNodeException;
import org.elasticsearch.action.admin.cluster.node.usage.NodeUsageStatsForThreadPoolsAction;
import org.elasticsearch.action.admin.cluster.node.usage.TransportNodeUsageStatsForThreadPoolsAction;
import org.elasticsearch.client.internal.Client;
import org.elasticsearch.cluster.node.DiscoveryNode;

import java.util.Arrays;
import java.util.HashMap;
import java.util.Map;
import java.util.concurrent.ConcurrentHashMap;

/**
* Collects the thread pool usage stats for each node in the cluster.
* <p>
* Results are returned as a map of node ID to node usage stats.
* Results are returned as a map of node ID to node usage stats. Keeps track of the most recent
* usage stats for each node, which will be returned in the event of a failure response from that node.
*/
public class NodeUsageStatsForThreadPoolsCollector {
public static final NodeUsageStatsForThreadPoolsCollector EMPTY = new NodeUsageStatsForThreadPoolsCollector() {
@@ -37,6 +43,8 @@ public void collectUsageStats(
"transport_node_usage_stats_for_thread_pools_action"
);

private final Map<String, NodeUsageStatsForThreadPools> lastNodeUsageStatsPerNode = new ConcurrentHashMap<>();

/**
* Collects the thread pool usage stats ({@link NodeUsageStatsForThreadPools}) for each node in the cluster.
*
@@ -47,15 +55,39 @@ public void collectUsageStats(
ClusterState clusterState,
ActionListener<Map<String, NodeUsageStatsForThreadPools>> listener
) {
var dataNodeIds = clusterState.nodes().getDataNodes().values().stream().map(node -> node.getId()).toArray(String[]::new);
var dataNodeIds = clusterState.nodes().getDataNodes().values().stream().map(DiscoveryNode::getId).toArray(String[]::new);
// Discard last-seen values for any nodes no longer present in the cluster state
lastNodeUsageStatsPerNode.keySet().retainAll(Arrays.asList(dataNodeIds));
if (clusterState.getMinTransportVersion().supports(TRANSPORT_NODE_USAGE_STATS_FOR_THREAD_POOLS_ACTION)) {
client.execute(
TransportNodeUsageStatsForThreadPoolsAction.TYPE,
new NodeUsageStatsForThreadPoolsAction.Request(dataNodeIds),
listener.map(response -> response.getAllNodeUsageStatsForThreadPools())
listener.map(this::replaceFailuresWithLastSeenValues)
);
} else {
listener.onResponse(Map.of());
}
}

private Map<String, NodeUsageStatsForThreadPools> replaceFailuresWithLastSeenValues(
NodeUsageStatsForThreadPoolsAction.Response response
) {
final Map<String, NodeUsageStatsForThreadPools> returnedUsageStats = response.getAllNodeUsageStatsForThreadPools();
// Update the last-seen usage stats
this.lastNodeUsageStatsPerNode.putAll(returnedUsageStats);

if (response.hasFailures() == false) {
return returnedUsageStats;
}

// Add in the last-seen usage stats for any nodes that failed to respond
final Map<String, NodeUsageStatsForThreadPools> cachedValuesForFailed = new HashMap<>(returnedUsageStats);
for (FailedNodeException failedNodeException : response.failures()) {
final var nodeUsageStatsForThreadPools = lastNodeUsageStatsPerNode.get(failedNodeException.nodeId());
Member
@ywangd Sep 9, 2025
I wonder, instead of using the last value as a fallback, can we have a specific NodeUsageStatsForThreadPools object representing failure? Other parts of the code would have to check it explicitly to make a decision, e.g. the write load decider potentially rejecting allocation?

My thinking is that we probably don't want to fall back more than a few times, i.e. the last value needs to expire at a certain point. I guess that's probably the reason you added the received timestamp? In that case, we still have to address what we use to indicate a "failed and expired" entry. If a node fails to respond to ClusterInfo polling, it is likely overloaded, e.g. CBE. So it seems safer to assume rejection or overall no movement for the node?
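
One way to express the "specific object representing failure" idea, sketched only; the UNAVAILABLE constant and isUnavailable helper are hypothetical and not part of this PR:

// Hypothetical sentinel that consumers (e.g. the write load decider) could check explicitly,
// instead of silently treating stale numbers as fresh ones.
public static final NodeUsageStatsForThreadPools UNAVAILABLE =
    new NodeUsageStatsForThreadPools("_unavailable_", Map.of(), Instant.EPOCH);

public static boolean isUnavailable(NodeUsageStatsForThreadPools stats) {
    return stats == UNAVAILABLE;
}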

if (nodeUsageStatsForThreadPools != null) {
cachedValuesForFailed.put(failedNodeException.nodeId(), nodeUsageStatsForThreadPools);
Contributor
Can lastNodeUsageStatsPerNode be returned directly instead? putAll above adds the new values for the nodeId keys. So whatever nodes are missing in the new response will not be overridden in lastNodeUsageStatsPerNode.

Contributor Author
Yes, perhaps... I wonder if there's some way it could include values for nodes we didn't request? Probably not, if things happen in the sequence we expect them to. I will come back to this.

Contributor Author
I think I'd prefer not to, because lastNodeUsageStatsPerNode is internal state: it's mutable and it will be mutated by the collector (to expire values for nodes no longer in the cluster). I think the way it is now is more explicit: we take the cached value for any node in response.failures() and we return a static map.

Contributor
Ah I see, good point about immutability 👍

Would it be sufficient to return a copy of lastNodeUsageStatsPerNode and skip this whole for-loop? No longer present nodes have already been filtered out in a prior stage, and the successful node responses were applied above.
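
The suggested shortcut would amount to something like the following sketch, trading the explicit failure loop for a defensive copy of the collector's internal map:

// After retainAll() on cluster membership and putAll(returnedUsageStats), the cache already
// holds fresh values plus the last-seen values for any nodes that failed to respond.
return Map.copyOf(lastNodeUsageStatsPerNode);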

}
}
Contributor Author
@nicktindall Sep 1, 2025
Not sure whether it makes sense to cache these things forever, or to put some limit on how long we consider them better than nothing. I can't imagine that a node being part of the cluster but returning errors for node usage stats requests is a situation that persists for very long.

Contributor
Only the last seen value for each node is cached. That doesn't seem expensive to save and it'll be refreshed frequently.

A WARN message log for each node that fails to respond would be good, along with its error cause/msg. It shouldn't happen often, so I don't expect it'll be noisy.

Contributor Author
Yeah I was more worried about whether there's a point where the cached value is so stale it's not useful, but I think it's probably always better than nothing.

Contributor
Yes, always better than nothing 👍

A cluster where a node is repeatedly failing to return stats probably has much bigger problems than this stale value.
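
If bounded staleness is wanted later, a small age check could drop cached entries past a cutoff; a sketch assuming the timestamp added in this PR is available on the cached value, with a hypothetical MAX_STALENESS constant:

private static final TimeValue MAX_STALENESS = TimeValue.timeValueMinutes(5); // hypothetical cutoff

// Returns true when the cached stats are too old to be worth reporting.
private boolean isTooStale(NodeUsageStatsForThreadPools cached) {
    return cached.timestamp().isBefore(Instant.now().minusMillis(MAX_STALENESS.millis()));
}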

return cachedValuesForFailed;
}
}
@@ -80,7 +80,8 @@ public Map<String, NodeUsageStatsForThreadPools> simulatedNodeUsageStatsForThrea
simulatedNodeWriteLoadDeltas.get(entry.getKey()),
nodesWithMovedAwayShard.contains(entry.getKey())
)
)
),
entry.getValue().timestamp()
);
adjustedNodeUsageStatsForThreadPools.put(entry.getKey(), adjustedValue);
} else {