Merged

Commits (19)
4037ccf
Add allocation write load stats to write thread pool
DiannaHohensee Jul 1, 2025
b8d8e56
reduce utilization EWMA to parallel polling: allow utilization to be …
DiannaHohensee Jul 2, 2025
fd8f602
Merge branch 'main' into 2025/06/30/add-node-level-write-load-stats
DiannaHohensee Jul 2, 2025
47debcd
Merge branch 'main' into 2025/06/30/add-node-level-write-load-stats
DiannaHohensee Jul 3, 2025
37b20e8
sub class pollUtilization into UtilizationTracker; rename from EWMA t…
DiannaHohensee Jul 3, 2025
be50439
Merge branch 'main' into 2025/06/30/add-node-level-write-load-stats
DiannaHohensee Jul 3, 2025
45f6836
Merge branch 'main' into 2025/06/30/add-node-level-write-load-stats
DiannaHohensee Jul 9, 2025
bd2e0f1
rename
DiannaHohensee Jul 9, 2025
629ca35
Merge branch 'main' into 2025/06/30/add-node-level-write-load-stats
DiannaHohensee Jul 9, 2025
b0fdaf2
Remove EWMA; add Max
DiannaHohensee Jul 9, 2025
46a659b
Merge branch 'main' into 2025/06/30/add-node-level-write-load-stats
DiannaHohensee Jul 9, 2025
3ecac48
typo fix
DiannaHohensee Jul 9, 2025
4986fa5
remove debug logging; tidying up
DiannaHohensee Jul 9, 2025
7ded14f
add try-finally in test
DiannaHohensee Jul 9, 2025
f942332
[CI] Auto commit changes from spotless
Jul 9, 2025
f9488fe
Merge branch 'main' into 2025/06/30/add-node-level-write-load-stats
DiannaHohensee Jul 9, 2025
5057324
some improvements from Nick's review.
DiannaHohensee Jul 10, 2025
fb05d84
Merge branch 'main' into 2025/06/30/add-node-level-write-load-stats
DiannaHohensee Jul 10, 2025
326f9e1
Merge branch 'main' into 2025/06/30/add-node-level-write-load-stats
DiannaHohensee Jul 10, 2025
@@ -40,7 +40,9 @@

 import static org.elasticsearch.test.hamcrest.ElasticsearchAssertions.assertNoFailures;
 import static org.elasticsearch.threadpool.ThreadPool.DEFAULT_INDEX_AUTOSCALING_EWMA_ALPHA;
+import static org.elasticsearch.threadpool.ThreadPool.DEFAULT_WRITE_THREAD_POOL_QUEUE_LATENCY_EWMA_ALPHA;
 import static org.elasticsearch.threadpool.ThreadPool.WRITE_THREAD_POOLS_EWMA_ALPHA_SETTING;
+import static org.elasticsearch.threadpool.ThreadPool.WRITE_THREAD_POOL_QUEUE_LATENCY_EWMA_ALPHA;
 import static org.elasticsearch.xcontent.XContentFactory.jsonBuilder;
 import static org.hamcrest.Matchers.contains;
 import static org.hamcrest.Matchers.greaterThanOrEqualTo;
@@ -234,19 +236,35 @@ public void assertValid(TestTelemetryPlugin testTelemetryPlugin, String metricSu
}
}

-    public void testWriteThreadpoolEwmaAlphaSetting() {
+    public void testWriteThreadpoolsEwmaAlphaSetting() {
         Settings settings = Settings.EMPTY;
-        var ewmaAlpha = DEFAULT_INDEX_AUTOSCALING_EWMA_ALPHA;
+        var executionEwmaAlpha = DEFAULT_INDEX_AUTOSCALING_EWMA_ALPHA;
+        var queueLatencyEwmaAlpha = DEFAULT_WRITE_THREAD_POOL_QUEUE_LATENCY_EWMA_ALPHA;
         if (randomBoolean()) {
-            ewmaAlpha = randomDoubleBetween(0.0, 1.0, true);
-            settings = Settings.builder().put(WRITE_THREAD_POOLS_EWMA_ALPHA_SETTING.getKey(), ewmaAlpha).build();
+            executionEwmaAlpha = randomDoubleBetween(0.0, 1.0, true);
+            queueLatencyEwmaAlpha = randomDoubleBetween(0.0, 1.0, true);
+            settings = Settings.builder()
+                .put(WRITE_THREAD_POOLS_EWMA_ALPHA_SETTING.getKey(), executionEwmaAlpha)
+                .put(WRITE_THREAD_POOL_QUEUE_LATENCY_EWMA_ALPHA.getKey(), queueLatencyEwmaAlpha)
+                .build();
         }
         var nodeName = internalCluster().startNode(settings);
         var threadPool = internalCluster().getInstance(ThreadPool.class, nodeName);

+        // Verify that the write thread pools all use the tracking executor.
         for (var name : List.of(ThreadPool.Names.WRITE, ThreadPool.Names.SYSTEM_WRITE, ThreadPool.Names.SYSTEM_CRITICAL_WRITE)) {
             assertThat(threadPool.executor(name), instanceOf(TaskExecutionTimeTrackingEsThreadPoolExecutor.class));
             final var executor = (TaskExecutionTimeTrackingEsThreadPoolExecutor) threadPool.executor(name);
-            assertThat(Double.compare(executor.getEwmaAlpha(), ewmaAlpha), CoreMatchers.equalTo(0));
+            assertThat(Double.compare(executor.getExecutionEwmaAlpha(), executionEwmaAlpha), CoreMatchers.equalTo(0));
+
+            // Only the WRITE thread pool should enable further tracking.
+            if (name.equals(ThreadPool.Names.WRITE) == false) {
+                assertFalse(executor.trackingQueueLatencyEwma());
+            } else {
+                // Verify that the WRITE thread pool has extra tracking enabled.
+                assertTrue(executor.trackingQueueLatencyEwma());
+                assertThat(Double.compare(executor.getQueueLatencyEwmaAlpha(), queueLatencyEwmaAlpha), CoreMatchers.equalTo(0));
+            }
         }
     }
 }
@@ -547,6 +547,7 @@ public void apply(Settings value, Settings current, Settings previous) {
ThreadPool.LATE_TIME_INTERVAL_WARN_THRESHOLD_SETTING,
ThreadPool.SLOW_SCHEDULER_TASK_WARN_THRESHOLD_SETTING,
ThreadPool.WRITE_THREAD_POOLS_EWMA_ALPHA_SETTING,
+            ThreadPool.WRITE_THREAD_POOL_QUEUE_LATENCY_EWMA_ALPHA,
FastVectorHighlighter.SETTING_TV_HIGHLIGHT_MULTI_VALUE,
Node.BREAKER_TYPE_KEY,
OperationRouting.USE_ADAPTIVE_REPLICA_SELECTION_SETTING,
@@ -577,24 +577,65 @@ public void rejectedExecution(Runnable task, ThreadPoolExecutor executor) {
}

public static class TaskTrackingConfig {
-        // This is a random starting point alpha. TODO: revisit this with actual testing and/or make it configurable
-        public static final double DEFAULT_EWMA_ALPHA = 0.3;
+        public static final double DEFAULT_EXECUTION_TIME_EWMA_ALPHA_FOR_TEST = 0.3;
+        public static final double DEFAULT_QUEUE_LATENCY_EWMA_ALPHA_FOR_TEST = 0.6;

         private final boolean trackExecutionTime;
         private final boolean trackOngoingTasks;
-        private final double ewmaAlpha;
+        private final boolean trackQueueLatencyEWMA;
+        private final double executionTimeEwmaAlpha;
+        private final double queueLatencyEWMAAlpha;

+        public static final TaskTrackingConfig DO_NOT_TRACK = new TaskTrackingConfig(
+            false,
+            false,
+            false,
+            DEFAULT_EXECUTION_TIME_EWMA_ALPHA_FOR_TEST,
+            DEFAULT_QUEUE_LATENCY_EWMA_ALPHA_FOR_TEST
+        );
+        public static final TaskTrackingConfig DEFAULT = new TaskTrackingConfig(
+            true,
+            false,
+            false,
+            DEFAULT_EXECUTION_TIME_EWMA_ALPHA_FOR_TEST,
+            DEFAULT_QUEUE_LATENCY_EWMA_ALPHA_FOR_TEST
+        );
@nicktindall (Contributor), Jul 4, 2025:

Nit: I think when you get to three consecutive booleans, it's worth making a builder for readability, e.g.

TaskTrackingConfig config = TaskTrackingConfig.builder()
    .trackingOngoingTasks()
    .trackingQueueLatencyAverage()
    // ... etc
    .build();

A side benefit would be you could default all to false and just specify the ones that were true.

Happy to be challenged on that.
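
As a rough sketch of that suggestion (not the merged API; method names and defaults here are illustrative), such a builder could wrap the five-argument private constructor shown further down:

// Hypothetical TaskTrackingConfig.Builder: all tracking defaults to off,
// callers enable only what they need.
public static class Builder {
    private boolean trackOngoingTasks = false;
    private boolean trackQueueLatencyEWMA = false;
    private double executionTimeEwmaAlpha = DEFAULT_EXECUTION_TIME_EWMA_ALPHA_FOR_TEST;
    private double queueLatencyEwmaAlpha = DEFAULT_QUEUE_LATENCY_EWMA_ALPHA_FOR_TEST;

    public Builder trackingOngoingTasks() {
        this.trackOngoingTasks = true;
        return this;
    }

    public Builder trackingQueueLatencyAverage(double alpha) {
        this.trackQueueLatencyEWMA = true;
        this.queueLatencyEwmaAlpha = alpha;
        return this;
    }

    public Builder executionTimeEwmaAlpha(double alpha) {
        this.executionTimeEwmaAlpha = alpha;
        return this;
    }

    public TaskTrackingConfig build() {
        // Delegate to the five-argument private constructor added in this PR.
        return new TaskTrackingConfig(true, trackOngoingTasks, trackQueueLatencyEWMA, executionTimeEwmaAlpha, queueLatencyEwmaAlpha);
    }
}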

Contributor:

If we decide this is a good idea, also happy for it to be a separate PR because it'll add a fair bit of noise I imagine

@DiannaHohensee (Author):

Sure, I'll put up a separate PR for this

Contributor:

Alternatively I wonder if the max latency tracking is so cheap that we don't need to have a config for it. The ongoing task tracking is quite expensive, so it makes sense to be able to opt out of it. It would also reduce the surface area of this change.

@DiannaHohensee (Author):

It seems a waste to track information unnecessarily, and potentially confusing to wonder why it's not used. I'd prefer not to. Also, you never know how code will grow over time.

@DiannaHohensee (Author):

To close this out, I've merged a TaskTrackingConfig Builder subclass and updated the callers with it.


-        public static final TaskTrackingConfig DO_NOT_TRACK = new TaskTrackingConfig(false, false, DEFAULT_EWMA_ALPHA);
-        public static final TaskTrackingConfig DEFAULT = new TaskTrackingConfig(true, false, DEFAULT_EWMA_ALPHA);
+        public TaskTrackingConfig(boolean trackOngoingTasks, double executionTimeEWMAAlpha) {
+            this(true, trackOngoingTasks, false, executionTimeEWMAAlpha, DEFAULT_QUEUE_LATENCY_EWMA_ALPHA_FOR_TEST);
+        }

-        public TaskTrackingConfig(boolean trackOngoingTasks, double ewmaAlpha) {
-            this(true, trackOngoingTasks, ewmaAlpha);
-        }
+        /**
+         * Execution tracking enabled constructor, with extra options to enable further specialized tracking.
+         */
+        public TaskTrackingConfig(
+            boolean trackOngoingTasks,
+            boolean trackQueueLatencyEWMA,
+            double executionTimeEWMAAlpha,
+            double queueLatencyEWMAAlpha
+        ) {
+            this(true, trackOngoingTasks, trackQueueLatencyEWMA, executionTimeEWMAAlpha, queueLatencyEWMAAlpha);
+        }

-        private TaskTrackingConfig(boolean trackExecutionTime, boolean trackOngoingTasks, double EWMAAlpha) {
+        /**
+         * @param trackExecutionTime Whether to track execution stats
+         * @param trackOngoingTasks Whether to track ongoing task execution time, not just finished tasks
+         * @param trackQueueLatencyEWMA Whether to track queue latency {@link org.elasticsearch.common.ExponentiallyWeightedMovingAverage}
+         * @param executionTimeEWMAAlpha The alpha seed for execution time EWMA (ExponentiallyWeightedMovingAverage).
+         * @param queueLatencyEWMAAlpha The alpha seed for task queue latency EWMA (ExponentiallyWeightedMovingAverage).
+         */
+        private TaskTrackingConfig(
+            boolean trackExecutionTime,
+            boolean trackOngoingTasks,
+            boolean trackQueueLatencyEWMA,
+            double executionTimeEWMAAlpha,
+            double queueLatencyEWMAAlpha
+        ) {
             this.trackExecutionTime = trackExecutionTime;
             this.trackOngoingTasks = trackOngoingTasks;
-            this.ewmaAlpha = EWMAAlpha;
+            this.trackQueueLatencyEWMA = trackQueueLatencyEWMA;
+            this.executionTimeEwmaAlpha = executionTimeEWMAAlpha;
+            this.queueLatencyEWMAAlpha = queueLatencyEWMAAlpha;
         }

public boolean trackExecutionTime() {
@@ -605,8 +646,16 @@ public boolean trackOngoingTasks() {
return trackOngoingTasks;
}

-        public double getEwmaAlpha() {
-            return ewmaAlpha;
-        }
+        public boolean trackQueueLatencyEWMA() {
+            return trackQueueLatencyEWMA;
+        }
+
+        public double getExecutionTimeEwmaAlpha() {
+            return executionTimeEwmaAlpha;
+        }
+
+        public double getQueueLatencyEwmaAlpha() {
+            return queueLatencyEWMAAlpha;
+        }
}

@@ -47,9 +47,13 @@ public final class TaskExecutionTimeTrackingEsThreadPoolExecutor extends EsThrea
private final boolean trackOngoingTasks;
// The set of currently running tasks and the timestamp of when they started execution in the Executor.
private final Map<Runnable, Long> ongoingTasks = new ConcurrentHashMap<>();
-    private volatile long lastPollTime = System.nanoTime();
-    private volatile long lastTotalExecutionTime = 0;
+    private volatile long lastPollTimeAPM = System.nanoTime();
+    private volatile long lastTotalExecutionTimeAPM = 0;
+    private volatile long lastPollTimeNanosAllocation = System.nanoTime();
+    private volatile long lastTotalExecutionTimeAllocation = 0;
     private final ExponentialBucketHistogram queueLatencyMillisHistogram = new ExponentialBucketHistogram(QUEUE_LATENCY_HISTOGRAM_BUCKETS);
+    private final boolean trackQueueLatencyEWMA;
+    private final ExponentiallyWeightedMovingAverage queueLatencyMillisEWMA;

TaskExecutionTimeTrackingEsThreadPoolExecutor(
String name,
@@ -65,9 +69,12 @@ public final class TaskExecutionTimeTrackingEsThreadPoolExecutor extends EsThrea
TaskTrackingConfig trackingConfig
) {
super(name, corePoolSize, maximumPoolSize, keepAliveTime, unit, workQueue, threadFactory, handler, contextHolder);

         this.runnableWrapper = runnableWrapper;
-        this.executionEWMA = new ExponentiallyWeightedMovingAverage(trackingConfig.getEwmaAlpha(), 0);
+        this.executionEWMA = new ExponentiallyWeightedMovingAverage(trackingConfig.getExecutionTimeEwmaAlpha(), 0);
         this.trackOngoingTasks = trackingConfig.trackOngoingTasks();
+        this.trackQueueLatencyEWMA = trackingConfig.trackQueueLatencyEWMA();
+        this.queueLatencyMillisEWMA = new ExponentiallyWeightedMovingAverage(trackingConfig.getQueueLatencyEwmaAlpha(), 0);
}

public List<Instrument> setupMetrics(MeterRegistry meterRegistry, String threadPoolName) {
@@ -95,7 +102,7 @@ public List<Instrument> setupMetrics(MeterRegistry meterRegistry, String threadP
ThreadPool.THREAD_POOL_METRIC_PREFIX + threadPoolName + THREAD_POOL_METRIC_NAME_UTILIZATION,
"fraction of maximum thread time utilized for " + threadPoolName,
"fraction",
-                () -> new DoubleWithAttributes(pollUtilization(), Map.of())
+                () -> new DoubleWithAttributes(pollUtilization(true, false), Map.of())
)
);
}
@@ -136,23 +143,43 @@ public int getCurrentQueueSize() {
return getQueue().size();
}

+    public double getQueuedTaskLatencyMillisEWMA() {
+        if (trackQueueLatencyEWMA == false) {
+            return 0;
+        }
+        return queueLatencyMillisEWMA.getAverage();
+    }

/**
-     * Returns the fraction of the maximum possible thread time that was actually used since the last time
-     * this method was called.
+     * Returns the fraction of the maximum possible thread time that was actually used since the last time this method was called.
+     * Exactly one of the two boolean parameters must be true, the other false: there are two periodic polling mechanisms that read
+     * utilization, and each needs its own last-poll bookkeeping.
      *
-     * @return the utilization as a fraction, in the range [0, 1]
+     * @return the utilization as a fraction, typically in the range [0, 1]. This may exceed 1 if a task completed in the polled time
+     *         range but started earlier, contributing a larger execution time than the range itself.
      */
-    public double pollUtilization() {
+    public double pollUtilization(boolean forAPM, boolean forAllocation) {
+        assert forAPM ^ forAllocation : "Can only collect one or the other, APM: " + forAPM + ", Allocation: " + forAllocation;
+
         final long currentTotalExecutionTimeNanos = totalExecutionTime.sum();
         final long currentPollTimeNanos = System.nanoTime();

-        final long totalExecutionTimeSinceLastPollNanos = currentTotalExecutionTimeNanos - lastTotalExecutionTime;
-        final long timeSinceLastPoll = currentPollTimeNanos - lastPollTime;
+        final long totalExecutionTimeSinceLastPollNanos = currentTotalExecutionTimeNanos - (forAPM
+            ? lastTotalExecutionTimeAPM
+            : lastTotalExecutionTimeAllocation);
+        final long timeSinceLastPoll = currentPollTimeNanos - (forAPM ? lastPollTimeAPM : lastPollTimeNanosAllocation);
         final long maximumExecutionTimeSinceLastPollNanos = timeSinceLastPoll * getMaximumPoolSize();
         final double utilizationSinceLastPoll = (double) totalExecutionTimeSinceLastPollNanos / maximumExecutionTimeSinceLastPollNanos;

-        lastTotalExecutionTime = currentTotalExecutionTimeNanos;
-        lastPollTime = currentPollTimeNanos;
+        if (forAPM) {
+            lastTotalExecutionTimeAPM = currentTotalExecutionTimeNanos;
+            lastPollTimeAPM = currentPollTimeNanos;
+        } else {
+            assert forAllocation;
+            lastTotalExecutionTimeAllocation = currentTotalExecutionTimeNanos;
+            lastPollTimeNanosAllocation = currentPollTimeNanos;
+        }

        return utilizationSinceLastPoll;
    }

@@ -161,12 +188,18 @@ protected void beforeExecute(Thread t, Runnable r) {
if (trackOngoingTasks) {
ongoingTasks.put(r, System.nanoTime());
}

assert super.unwrap(r) instanceof TimedRunnable : "expected only TimedRunnables in queue";
final TimedRunnable timedRunnable = (TimedRunnable) super.unwrap(r);
timedRunnable.beforeExecute();
final long taskQueueLatency = timedRunnable.getQueueTimeNanos();
assert taskQueueLatency >= 0;
-        queueLatencyMillisHistogram.addObservation(TimeUnit.NANOSECONDS.toMillis(taskQueueLatency));
+        var queueLatencyMillis = TimeUnit.NANOSECONDS.toMillis(taskQueueLatency);
+        queueLatencyMillisHistogram.addObservation(queueLatencyMillis);
+
+        if (trackQueueLatencyEWMA) {
+            queueLatencyMillisEWMA.addValue(queueLatencyMillis);
+        }
}

@Override
@@ -208,6 +241,9 @@ protected void appendThreadPoolExecutorDetails(StringBuilder sb) {
.append("total task execution time = ")
.append(TimeValue.timeValueNanos(getTotalTaskExecutionTime()))
.append(", ");
+        if (trackQueueLatencyEWMA) {
+            sb.append("task queue EWMA = ").append(TimeValue.timeValueMillis((long) getQueuedTaskLatencyMillisEWMA())).append(", ");
+        }
}

/**
@@ -222,7 +258,17 @@ public Map<Runnable, Long> getOngoingTasks() {
}

// Used for testing
-    public double getEwmaAlpha() {
+    public double getExecutionEwmaAlpha() {
         return executionEWMA.getAlpha();
     }
+
+    // Used for testing
+    public double getQueueLatencyEwmaAlpha() {
+        return queueLatencyMillisEWMA.getAlpha();
+    }
+
+    // Used for testing
+    public boolean trackingQueueLatencyEwma() {
+        return trackQueueLatencyEWMA;
+    }
}
@@ -21,6 +21,7 @@

import static java.util.Collections.unmodifiableMap;
import static org.elasticsearch.threadpool.ThreadPool.WRITE_THREAD_POOLS_EWMA_ALPHA_SETTING;
+import static org.elasticsearch.threadpool.ThreadPool.WRITE_THREAD_POOL_QUEUE_LATENCY_EWMA_ALPHA;
import static org.elasticsearch.threadpool.ThreadPool.searchAutoscalingEWMA;

public class DefaultBuiltInExecutorBuilders implements BuiltInExecutorBuilders {
@@ -32,6 +33,7 @@ public Map<String, ExecutorBuilder> getBuilders(Settings settings, int allocated
final int halfProcMaxAt10 = ThreadPool.halfAllocatedProcessorsMaxTen(allocatedProcessors);
final int genericThreadPoolMax = ThreadPool.boundedBy(4 * allocatedProcessors, 128, 512);
final double indexAutoscalingEWMA = WRITE_THREAD_POOLS_EWMA_ALPHA_SETTING.get(settings);
+        final double queueLatencyEWMAAlpha = WRITE_THREAD_POOL_QUEUE_LATENCY_EWMA_ALPHA.get(settings);

Map<String, ExecutorBuilder> result = new HashMap<>();
result.put(
@@ -55,7 +57,7 @@ public Map<String, ExecutorBuilder> getBuilders(Settings settings, int allocated
ThreadPool.Names.WRITE,
allocatedProcessors,
10000,
-                new EsExecutors.TaskTrackingConfig(true, indexAutoscalingEWMA)
+                new EsExecutors.TaskTrackingConfig(true, true, indexAutoscalingEWMA, queueLatencyEWMAAlpha)
)
);
int searchOrGetThreadPoolSize = ThreadPool.searchOrGetThreadPoolSize(allocatedProcessors);
21 changes: 21 additions & 0 deletions server/src/main/java/org/elasticsearch/threadpool/ThreadPool.java
@@ -217,6 +217,16 @@ public static ThreadPoolType fromType(String type) {
// moving average is 100ms, and we get one task which takes 20s the new EWMA will be ~500ms.
public static final double DEFAULT_INDEX_AUTOSCALING_EWMA_ALPHA = 0.02;

+    /**
+     * If the queue latency reaches a high value (e.g. 10-30 seconds), then this thread pool is overwhelmed. It may be temporary, but
+     * that spike warrants the allocation balancer adjusting some number of shards, if possible. Therefore, it is alright to react
+     * quickly.
+     *
+     * As an example, suppose the EWMA is 10_000ms, i.e. 10 seconds.
+     * A single task that waited 30_000ms, i.e. 30 seconds, in the queue would result in a new EWMA of 12_000ms:
+     * 0.1 x 30_000ms + 0.9 x 10_000ms = 3_000ms + 9_000ms = 12_000ms
+     */
+    public static final double DEFAULT_WRITE_THREAD_POOL_QUEUE_LATENCY_EWMA_ALPHA = 0.1;
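As a sanity check on the arithmetic above, the EWMA update rule is newAverage = alpha * sample + (1 - alpha) * oldAverage; a minimal standalone sketch (plain arithmetic, not the ExponentiallyWeightedMovingAverage class itself):

// EWMA update from the javadoc example, alpha = 0.1.
double alpha = DEFAULT_WRITE_THREAD_POOL_QUEUE_LATENCY_EWMA_ALPHA; // 0.1
double oldAverageMillis = 10_000; // current EWMA: 10 seconds
double sampleMillis = 30_000;     // one task queued for 30 seconds
double newAverageMillis = alpha * sampleMillis + (1 - alpha) * oldAverageMillis;
// 0.1 * 30_000 + 0.9 * 10_000 = 3_000 + 9_000 = 12_000ms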
Member:

Intuitively, I feel this value could use some relationship with the one for executionEWMA since they are updated with the same frequency before and after the task execution, respectively. If the node has N write threads and we observe N large execution times in a row, it feels roughly a similar effect as seeing a single large queue latency, i.e. all threads were busy executing tasks that took long to complete. This makes me think whether this value should be N * executionEWMA_alpha. I could be wrong on this. Just an idea.

Contributor:

I'm struggling a bit with the ewma used in this way. We get a value weighted towards the last N values; those could have arrived in the last 0.5ms or the last 30 seconds, so it seems difficult to reason about. I wonder if it'd be easier to reason about if the queue were polled at a fixed interval (by the same regular thread that we introduce to poll the utilization) and we asked the task at the front of the queue how long it'd been waiting?

That way I think the utilisation and latency would reflect observations over the same fixed time period and we could tweak the alpha values to find a period that was meaningful.

Contributor:

Also I think it means the average doesn't move if there are no tasks moving through the queue? I know it's unlikely, but it seems wrong that the average would just linger at whatever it was most recently until a task was executed. The latency should return to zero when there's nothing happening.

@DiannaHohensee (Author):

> I wonder if it'd be easier to reason about if the queue were polled at a fixed interval (by the same regular thread that we introduce to poll the utilization) and ask the task at the front of the queue how long it'd been waiting?

Our ThreadPools use a LinkedTransferQueue, which has peek(), and the task is TimedRunnable that has a creationTimeNanos private field. So I could grab a work queue reference in TaskExecutionTimeTrackingEsThreadPoolExecutor and add a TimedRunnable#getCreationTimeNanos method. So that is an option.

I had thought the EWMA would be better than periodically checking, as an average, but it does have some limitations. The argument against periodic polling is that we might see a momentary spike and not know it, or, vice versa, miss frequent spikes.
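
A rough sketch of that front-of-queue option, assuming a hypothetical TimedRunnable#getCreationTimeNanos accessor were added (neither the accessor nor this poller exists in this PR as written):

// Hypothetical periodic poller inside TaskExecutionTimeTrackingEsThreadPoolExecutor:
// sample how long the task at the head of the work queue has been waiting.
// Returns zero for an empty queue, so a time-based average fed from this
// naturally decays while the pool is idle.
private long pollFrontOfQueueLatencyNanos() {
    final Runnable head = getQueue().peek(); // LinkedTransferQueue supports peek()
    if (head == null) {
        return 0;
    }
    if (unwrap(head) instanceof TimedRunnable timedRunnable) {
        // getCreationTimeNanos() is the accessor proposed above, not yet real.
        return System.nanoTime() - timedRunnable.getCreationTimeNanos();
    }
    return 0;
}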

@DiannaHohensee (Author):

Pooya was explaining that the executionTimeEWMA is okay to be disconnected from time by the way that auto-scaling uses the value. The idea being that when writes do occur, the executionTimeEWMA reflects the average time needed to serve the requests. Even if there are lulls where no write requests occur, the idea is that when the writes do occur again, the executionTimeEWMA accurately reflects the work requirements.

@DiannaHohensee (Author):

> I had thought the EWMA would be better than periodically checking, as an average, but it does have some limitations. The argument against periodic polling is that we might see a momentary spike and not know it, or, vice versa, miss frequent spikes.

Though, if we are going to create a polling system for thread pool utilization on the data node, per comment, then maybe we also have the opportunity to create polling for the queue latency with a time-stable period EWMA. Then both EWMAs will be relatable to time, instead of the number of tasks.

@DiannaHohensee (Author):

> If the node has N write threads and we observe N large execution times in a row, it feels roughly a similar effect as seeing a single large queue latency, i.e. all threads were busy executing tasks that took long to complete. This makes me think whether this value should be N * executionEWMA_alpha.

To make things a little more concrete, N * executionEWMA_alpha would be, say, 8 * .02 = 0.16.

I was hoping to make hot-spot indicators / node-level write load relatable to our shard-level write loads, so that the balancer could see "node A has X too much load, let's move X shard load off of node A". Shard load is relatable to execution time, and queue time, which is a lot more similar to auto-scaling calculations (I expect, haven't verified).

It's possible that we come up with a mathematical relationship between queue latency and needed execution resources in future, but lacking one right now, the alpha seed simply controls how fast we want the average to respond to new data points. The number of write threads on a node seems relevant if relocating work to nodes with different thread pool sizes, or sizing up like auto-scaling does; not sure otherwise.

@DiannaHohensee (Author), Jul 2, 2025:

> Also I think it means the average doesn't move if there are no tasks moving through the queue? I know it's unlikely, but it seems wrong that the average would just linger at whatever it was most recently until a task was executed. The latency should return to zero when there's nothing happening.

A workaround would be for the balancer to only consider the queue duration when the thread pool utilization is greater than the 90% threshold. That would guarantee that both the thread pool is in use AND the queue latency is bad. An active thread pool with no queuing would quickly bring the EWMA back down again.
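
A minimal sketch of that gating idea (names are illustrative; no such check exists in this PR):

// Hypothetical balancer-side gate: only treat queue latency as a hot-spot
// signal while the pool is demonstrably busy, so a stale EWMA from an idle
// pool is ignored.
boolean writeHotSpot = utilization > 0.90
    && queueLatencyMillisEwma > queueLatencyThresholdMillis;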

After the other discussions, I think we have three options for queue latency:

  1. the master node polls every 30 seconds to see what the current front of the write queue latency is.
  2. continue taking the EWMA of all queue latencies (including zeros)
  3. create a periodic sampler for queue latency to add to an EWMA.

I'm inclined to do (1) as the simplest. I'd implement what I outlined above:

> Our ThreadPools use a LinkedTransferQueue, which has peek(), and the task is TimedRunnable that has a creationTimeNanos private field. So I could grab a work queue reference in TaskExecutionTimeTrackingEsThreadPoolExecutor and add a TimedRunnable#getCreationTimeNanos method.

Actually, the simplest would be to use what I already have, which is (2) 🤷‍♀️

@nicktindall (Contributor), Jul 4, 2025:


I'm still not a fan of the EWMA for this, because the rate at which the value goes down is dependent on the number of tasks moving through the queue. So if our average was 30s, and then we have 5 minutes with no tasks at all (and no queue), it'll still be 30s 5 minutes later despite there being no queue, until we run enough smaller values through it to bring the average down.

I don't feel strongly enough about it to block the change for it, because I think this scenario is probably unusual, but I have my reservations about it.

If we polled the queue regularly I would be happier with the EWMA as it'll decay when the queue is quiet.
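
To make the concern concrete: the EWMA is sample-driven rather than time-driven, as this sketch using the same update rule as above shows.

// With alpha = 0.1 and no tasks arriving, the average never decays:
double alpha = 0.1;
double averageMillis = 30_000; // EWMA reads 30s when traffic stops
// ...five idle minutes later it still reads 30_000ms, because addValue() was
// never called. Only new, small samples bring it down:
for (int i = 0; i < 7; i++) {
    averageMillis = alpha * 0 + (1 - alpha) * averageMillis;
}
// averageMillis ≈ 30_000 * 0.9^7 ≈ 14_300ms: still high after seven fast tasks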


private final Map<String, ExecutorHolder> executors;

private final ThreadPoolInfo threadPoolInfo;
@@ -271,6 +281,17 @@ public Collection<ExecutorBuilder> builders() {
Setting.Property.NodeScope
);

+    /**
+     * The {@link org.elasticsearch.common.ExponentiallyWeightedMovingAverage} alpha for tracking task queue latency.
+     */
+    public static final Setting<Double> WRITE_THREAD_POOL_QUEUE_LATENCY_EWMA_ALPHA = Setting.doubleSetting(
+        "thread_pool.task_tracking.queue_latency.ewma_alpha",
+        DEFAULT_WRITE_THREAD_POOL_QUEUE_LATENCY_EWMA_ALPHA,
+        0,
+        1,
+        Setting.Property.NodeScope
+    );

/**
* Defines and builds the many thread pools delineated in {@link Names}.
*