Commit f96674c

Remove queue latency metric changes
1 parent 51191eb commit f96674c

File tree: 5 files changed (+10, -163 lines)

docs/changelog/120488.yaml

Lines changed: 0 additions & 5 deletions
This file was deleted.

server/src/main/java/org/elasticsearch/common/util/concurrent/TaskExecutionTimeTrackingEsThreadPoolExecutor.java

Lines changed: 1 addition & 49 deletions
@@ -10,17 +10,9 @@
 package org.elasticsearch.common.util.concurrent;
 
 import org.elasticsearch.common.ExponentiallyWeightedMovingAverage;
-import org.elasticsearch.common.metrics.ExponentialBucketHistogram;
 import org.elasticsearch.common.util.concurrent.EsExecutors.TaskTrackingConfig;
 import org.elasticsearch.core.TimeValue;
-import org.elasticsearch.telemetry.metric.DoubleWithAttributes;
-import org.elasticsearch.telemetry.metric.Instrument;
-import org.elasticsearch.telemetry.metric.LongWithAttributes;
-import org.elasticsearch.telemetry.metric.MeterRegistry;
-import org.elasticsearch.threadpool.ThreadPool;
-
-import java.util.Arrays;
-import java.util.List;
+
 import java.util.Map;
 import java.util.concurrent.BlockingQueue;
 import java.util.concurrent.ConcurrentHashMap;
@@ -30,16 +22,11 @@
 import java.util.concurrent.atomic.LongAdder;
 import java.util.function.Function;
 
-import static org.elasticsearch.threadpool.ThreadPool.THREAD_POOL_METRIC_NAME_QUEUE_TIME;
-import static org.elasticsearch.threadpool.ThreadPool.THREAD_POOL_METRIC_NAME_UTILIZATION;
-
 /**
  * An extension to thread pool executor, which tracks statistics for the task execution time.
  */
 public final class TaskExecutionTimeTrackingEsThreadPoolExecutor extends EsThreadPoolExecutor {
 
-    private static final int[] LATENCY_PERCENTILES_TO_REPORT = { 50, 90, 99 };
-
     private final Function<Runnable, WrappedRunnable> runnableWrapper;
     private final ExponentiallyWeightedMovingAverage executionEWMA;
     private final LongAdder totalExecutionTime = new LongAdder();
@@ -48,7 +35,6 @@ public final class TaskExecutionTimeTrackingEsThreadPoolExecutor extends EsThrea
     private final Map<Runnable, Long> ongoingTasks = new ConcurrentHashMap<>();
     private volatile long lastPollTime = System.nanoTime();
    private volatile long lastTotalExecutionTime = 0;
-    private final ExponentialBucketHistogram queueLatencyMillisHistogram = new ExponentialBucketHistogram();
 
     TaskExecutionTimeTrackingEsThreadPoolExecutor(
         String name,
@@ -69,34 +55,6 @@ public final class TaskExecutionTimeTrackingEsThreadPoolExecutor extends EsThrea
         this.trackOngoingTasks = trackingConfig.trackOngoingTasks();
     }
 
-    public List<Instrument> setupMetrics(MeterRegistry meterRegistry, String threadPoolName) {
-        return List.of(
-            meterRegistry.registerLongsGauge(
-                ThreadPool.THREAD_POOL_METRIC_PREFIX + threadPoolName + THREAD_POOL_METRIC_NAME_QUEUE_TIME,
-                "Time tasks spent in the queue for the " + threadPoolName + " thread pool",
-                "milliseconds",
-                () -> {
-                    List<LongWithAttributes> metricValues = Arrays.stream(LATENCY_PERCENTILES_TO_REPORT)
-                        .mapToObj(
-                            percentile -> new LongWithAttributes(
-                                queueLatencyMillisHistogram.getPercentile(percentile / 100f),
-                                Map.of("percentile", String.valueOf(percentile))
-                            )
-                        )
-                        .toList();
-                    queueLatencyMillisHistogram.clear();
-                    return metricValues;
-                }
-            ),
-            meterRegistry.registerDoubleGauge(
-                ThreadPool.THREAD_POOL_METRIC_PREFIX + threadPoolName + THREAD_POOL_METRIC_NAME_UTILIZATION,
-                "fraction of maximum thread time utilized for " + threadPoolName,
-                "fraction",
-                () -> new DoubleWithAttributes(pollUtilization(), Map.of())
-            )
-        );
-    }
-
     @Override
     protected Runnable wrapRunnable(Runnable command) {
         return super.wrapRunnable(this.runnableWrapper.apply(command));
@@ -158,12 +116,6 @@ protected void beforeExecute(Thread t, Runnable r) {
         if (trackOngoingTasks) {
             ongoingTasks.put(r, System.nanoTime());
         }
-        assert super.unwrap(r) instanceof TimedRunnable : "expected only TimedRunnables in queue";
-        final TimedRunnable timedRunnable = (TimedRunnable) super.unwrap(r);
-        timedRunnable.beforeExecute();
-        final long taskQueueLatency = timedRunnable.getQueueTimeNanos();
-        assert taskQueueLatency >= 0;
-        queueLatencyMillisHistogram.addObservation(TimeUnit.NANOSECONDS.toMillis(taskQueueLatency));
     }
 
     @Override
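
Taken together, the hunks above remove the queue-latency measurement end to end: each submitted task was wrapped in a TimedRunnable that records its creation time, stamped again in beforeExecute(), and the elapsed queue time was added to a millisecond histogram. The minimal, self-contained sketch below shows that general wrap-and-timestamp pattern; QueueLatencyTrackingExecutor, QueueTimedTask, and the recordQueueMillis callback are names invented for this illustration and are not Elasticsearch APIs.

import java.util.concurrent.LinkedBlockingQueue;
import java.util.concurrent.ThreadPoolExecutor;
import java.util.concurrent.TimeUnit;
import java.util.function.LongConsumer;

// Illustrative sketch only, not the Elasticsearch implementation.
class QueueLatencyTrackingExecutor extends ThreadPoolExecutor {

    /** Wraps a submitted task and remembers when it entered the queue. */
    static final class QueueTimedTask implements Runnable {
        final Runnable delegate;
        final long creationTimeNanos = System.nanoTime();

        QueueTimedTask(Runnable delegate) {
            this.delegate = delegate;
        }

        @Override
        public void run() {
            delegate.run();
        }
    }

    private final LongConsumer recordQueueMillis;

    QueueLatencyTrackingExecutor(int threads, LongConsumer recordQueueMillis) {
        super(threads, threads, 0L, TimeUnit.MILLISECONDS, new LinkedBlockingQueue<>());
        this.recordQueueMillis = recordQueueMillis;
    }

    @Override
    public void execute(Runnable command) {
        // Wrap at submission time so the queue-entry timestamp is captured.
        super.execute(new QueueTimedTask(command));
    }

    @Override
    protected void beforeExecute(Thread thread, Runnable task) {
        super.beforeExecute(thread, task);
        if (task instanceof QueueTimedTask timed) {
            // Queue latency = time between submission and a worker picking the task up.
            long queuedNanos = System.nanoTime() - timed.creationTimeNanos;
            recordQueueMillis.accept(TimeUnit.NANOSECONDS.toMillis(queuedNanos));
        }
    }
}

Any sink can be plugged in as the callback, e.g. new QueueLatencyTrackingExecutor(4, histogram::addObservation) for a histogram with an addObservation(long) method.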

server/src/main/java/org/elasticsearch/common/util/concurrent/TimedRunnable.java

Lines changed: 0 additions & 21 deletions
@@ -18,7 +18,6 @@
 class TimedRunnable extends AbstractRunnable implements WrappedRunnable {
     private final Runnable original;
     private final long creationTimeNanos;
-    private long beforeExecuteTime = -1;
     private long startTimeNanos;
     private long finishTimeNanos = -1;
     private boolean failedOrRejected = false;
@@ -59,19 +58,6 @@ public boolean isForceExecution() {
         return original instanceof AbstractRunnable && ((AbstractRunnable) original).isForceExecution();
     }
 
-    /**
-     * Returns the time in nanoseconds between the creation time and the execution time
-     *
-     * @return The time in nanoseconds or -1 if the task was never de-queued
-     */
-    long getQueueTimeNanos() {
-        if (beforeExecuteTime == -1) {
-            assert false : "beforeExecute must be called before getQueueTimeNanos";
-            return -1;
-        }
-        return beforeExecuteTime - creationTimeNanos;
-    }
-
     /**
      * Return the time this task spent being run.
      * If the task is still running or has not yet been run, returns -1.
@@ -84,13 +70,6 @@ long getTotalExecutionNanos() {
         return Math.max(finishTimeNanos - startTimeNanos, 1);
     }
 
-    /**
-     * Called when the task has reached the front of the queue and is about to be executed
-     */
-    public void beforeExecute() {
-        beforeExecuteTime = System.nanoTime();
-    }
-
     /**
      * If the task was failed or rejected, return true.
      * Otherwise, false.

server/src/main/java/org/elasticsearch/threadpool/ThreadPool.java

Lines changed: 9 additions & 2 deletions
@@ -31,6 +31,7 @@
 import org.elasticsearch.core.TimeValue;
 import org.elasticsearch.node.Node;
 import org.elasticsearch.node.ReportingService;
+import org.elasticsearch.telemetry.metric.DoubleWithAttributes;
 import org.elasticsearch.telemetry.metric.Instrument;
 import org.elasticsearch.telemetry.metric.LongAsyncCounter;
 import org.elasticsearch.telemetry.metric.LongGauge;
@@ -153,7 +154,6 @@ public static class Names {
     public static final String THREAD_POOL_METRIC_NAME_UTILIZATION = ".threads.utilization.current";
     public static final String THREAD_POOL_METRIC_NAME_LARGEST = ".threads.largest.current";
     public static final String THREAD_POOL_METRIC_NAME_REJECTED = ".threads.rejected.total";
-    public static final String THREAD_POOL_METRIC_NAME_QUEUE_TIME = ".threads.queue.latency.histogram";
 
     public enum ThreadPoolType {
         FIXED("fixed"),
@@ -379,7 +379,14 @@ private static ArrayList<Instrument> setupMetrics(MeterRegistry meterRegistry, S
             }
 
             if (threadPoolExecutor instanceof TaskExecutionTimeTrackingEsThreadPoolExecutor timeTrackingExecutor) {
-                instruments.addAll(timeTrackingExecutor.setupMetrics(meterRegistry, name));
+                instruments.add(
+                    meterRegistry.registerDoubleGauge(
+                        prefix + THREAD_POOL_METRIC_NAME_UTILIZATION,
+                        "fraction of maximum thread time utilized for " + name,
+                        "fraction",
+                        () -> new DoubleWithAttributes(timeTrackingExecutor.pollUtilization(), at)
+                    )
+                );
             }
         }
         return instruments;
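
The replacement registration calls timeTrackingExecutor.pollUtilization(), which is not part of this diff. For orientation only, the sketch below shows how a poll-to-poll utilization fraction is commonly derived from counters like the totalExecutionTime, lastTotalExecutionTime, and lastPollTime fields visible in the executor above; UtilizationPoller and its method names are invented for this sketch and are not the Elasticsearch implementation.

import java.util.concurrent.atomic.LongAdder;

// Illustrative sketch only: fraction of available thread time spent executing
// tasks since the previous poll.
class UtilizationPoller {
    private final LongAdder totalExecutionTimeNanos = new LongAdder();
    private final int maxThreads;
    private volatile long lastPollTimeNanos = System.nanoTime();
    private volatile long lastTotalExecutionTimeNanos = 0;

    UtilizationPoller(int maxThreads) {
        this.maxThreads = maxThreads;
    }

    /** Called by worker threads after each task completes. */
    void addExecutionTime(long nanos) {
        totalExecutionTimeNanos.add(nanos);
    }

    /** Intended to be called by a single metrics poller. */
    double pollUtilization() {
        long now = System.nanoTime();
        long total = totalExecutionTimeNanos.sum();

        long executedSinceLastPoll = total - lastTotalExecutionTimeNanos;
        long elapsedSinceLastPoll = now - lastPollTimeNanos;

        lastTotalExecutionTimeNanos = total;
        lastPollTimeNanos = now;

        long maxAvailableNanos = elapsedSinceLastPoll * maxThreads;
        return maxAvailableNanos <= 0 ? 0.0 : (double) executedSinceLastPoll / maxAvailableNanos;
    }
}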

server/src/test/java/org/elasticsearch/common/util/concurrent/TaskExecutionTimeTrackingEsThreadPoolExecutorTests.java

Lines changed: 0 additions & 86 deletions
@@ -9,27 +9,18 @@
 
 package org.elasticsearch.common.util.concurrent;
 
-import org.elasticsearch.common.metrics.ExponentialBucketHistogram;
 import org.elasticsearch.common.settings.Settings;
 import org.elasticsearch.common.util.concurrent.EsExecutors.TaskTrackingConfig;
-import org.elasticsearch.telemetry.InstrumentType;
-import org.elasticsearch.telemetry.Measurement;
-import org.elasticsearch.telemetry.RecordingMeterRegistry;
 import org.elasticsearch.test.ESTestCase;
-import org.elasticsearch.threadpool.ThreadPool;
 
-import java.util.List;
 import java.util.concurrent.CountDownLatch;
-import java.util.concurrent.CyclicBarrier;
-import java.util.concurrent.Future;
 import java.util.concurrent.TimeUnit;
 import java.util.function.Function;
 
 import static org.elasticsearch.common.util.concurrent.EsExecutors.TaskTrackingConfig.DEFAULT_EWMA_ALPHA;
 import static org.hamcrest.Matchers.equalTo;
 import static org.hamcrest.Matchers.greaterThan;
 import static org.hamcrest.Matchers.greaterThanOrEqualTo;
-import static org.hamcrest.Matchers.hasSize;
 
 /**
  * Tests for the automatic queue resizing of the {@code QueueResizingEsThreadPoolExecutorTests}
@@ -156,83 +147,6 @@ public void testGetOngoingTasks() throws Exception {
         executor.awaitTermination(10, TimeUnit.SECONDS);
     }
 
-    public void testQueueLatencyMetrics() {
-        RecordingMeterRegistry meterRegistry = new RecordingMeterRegistry();
-        final var threadPoolName = randomIdentifier();
-        var executor = new TaskExecutionTimeTrackingEsThreadPoolExecutor(
-            threadPoolName,
-            1,
-            1,
-            1000,
-            TimeUnit.MILLISECONDS,
-            ConcurrentCollections.newBlockingQueue(),
-            TimedRunnable::new,
-            EsExecutors.daemonThreadFactory("queuetest"),
-            new EsAbortPolicy(),
-            new ThreadContext(Settings.EMPTY),
-            new TaskTrackingConfig(true, DEFAULT_EWMA_ALPHA)
-        );
-        executor.setupMetrics(meterRegistry, threadPoolName);
-
-        try {
-            final var barrier = new CyclicBarrier(2);
-            final ExponentialBucketHistogram expectedHistogram = new ExponentialBucketHistogram();
-
-            /*
-             * The thread pool has a single thread, so we submit a task that will occupy that thread
-             * and cause subsequent tasks to be queued
-             */
-            Future<?> runningTask = executor.submit(() -> {
-                safeAwait(barrier);
-                safeAwait(barrier);
-            });
-            safeAwait(barrier); // wait till the first task starts
-            expectedHistogram.addObservation(0L); // the first task should not be delayed
-
-            /*
-             * On each iteration we submit a task - which will be queued because of the
-             * currently running task, pause for some random interval, then unblock the
-             * new task by releasing the currently running task. This gives us a lower
-             * bound for the real delays (the real delays will be greater than or equal
-             * to the synthetic delays we add, i.e. each percentile should be >= our
-             * expected values)
-             */
-            for (int i = 0; i < 10; i++) {
-                Future<?> waitingTask = executor.submit(() -> {
-                    safeAwait(barrier);
-                    safeAwait(barrier);
-                });
-                final long delayTimeMs = randomLongBetween(1, 50);
-                safeSleep(delayTimeMs);
-                safeAwait(barrier); // let the running task complete
-                safeAwait(barrier); // wait for the next task to start
-                safeGet(runningTask); // ensure previous task is complete
-                expectedHistogram.addObservation(delayTimeMs);
-                runningTask = waitingTask;
-            }
-            safeAwait(barrier); // let the last task finish
-            safeGet(runningTask);
-            meterRegistry.getRecorder().collect();
-
-            List<Measurement> measurements = meterRegistry.getRecorder()
-                .getMeasurements(
-                    InstrumentType.LONG_GAUGE,
-                    ThreadPool.THREAD_POOL_METRIC_PREFIX + threadPoolName + ThreadPool.THREAD_POOL_METRIC_NAME_QUEUE_TIME
-                );
-            assertThat(measurements, hasSize(3));
-            // we have to use greater than or equal to because the actual delay might be higher than what we imposed
-            assertThat(getPercentile(measurements, "99"), greaterThanOrEqualTo(expectedHistogram.getPercentile(0.99f)));
-            assertThat(getPercentile(measurements, "90"), greaterThanOrEqualTo(expectedHistogram.getPercentile(0.9f)));
-            assertThat(getPercentile(measurements, "50"), greaterThanOrEqualTo(expectedHistogram.getPercentile(0.5f)));
-        } finally {
-            ThreadPool.terminate(executor, 10, TimeUnit.SECONDS);
-        }
-    }
-
-    private long getPercentile(List<Measurement> measurements, String percentile) {
-        return measurements.stream().filter(m -> m.attributes().get("percentile").equals(percentile)).findFirst().orElseThrow().getLong();
-    }
-
     /**
      * The returned function outputs a WrappedRunnabled that simulates the case
      * where {@link TimedRunnable#getTotalExecutionNanos()} always returns {@code timeTakenNanos}.
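
The deleted test asserts the reported percentiles against an ExponentialBucketHistogram, which is referenced in this diff but not defined here. As a rough mental model only, the sketch below shows one way a power-of-two bucketed histogram can answer getPercentile(); the bucket boundaries and the "upper bound of the covering bucket" semantics are assumptions of this sketch, not the actual Elasticsearch class.

import java.util.concurrent.atomic.LongAdder;

// Illustrative sketch only, not org.elasticsearch.common.metrics.ExponentialBucketHistogram.
class TinyExponentialHistogram {

    // Bucket 0 holds the value 0; bucket i (i >= 1) holds values in [2^(i-1), 2^i).
    private final LongAdder[] buckets = new LongAdder[64];

    TinyExponentialHistogram() {
        for (int i = 0; i < buckets.length; i++) {
            buckets[i] = new LongAdder();
        }
    }

    void addObservation(long value) {
        long v = Math.max(value, 0);
        int bucket = Math.min(64 - Long.numberOfLeadingZeros(v), buckets.length - 1);
        buckets[bucket].increment();
    }

    /** Returns an upper bound for the requested percentile, e.g. getPercentile(0.99f). */
    long getPercentile(float percentile) {
        long total = 0;
        for (LongAdder bucket : buckets) {
            total += bucket.sum();
        }
        if (total == 0) {
            return 0;
        }
        long threshold = (long) Math.ceil(total * percentile);
        long seen = 0;
        for (int i = 0; i < buckets.length; i++) {
            seen += buckets[i].sum();
            if (seen >= threshold) {
                if (i == 0) {
                    return 0;
                }
                // Upper bound of bucket i; the top bucket is open-ended.
                return i == buckets.length - 1 ? Long.MAX_VALUE : 1L << i;
            }
        }
        return Long.MAX_VALUE; // unreachable when total > 0
    }
}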
