@@ -96,6 +96,21 @@ public static PrioritizedEsThreadPoolExecutor newSinglePrioritizing(
9696 return new PrioritizedEsThreadPoolExecutor (name , 1 , 1 , 0L , TimeUnit .MILLISECONDS , threadFactory , contextHolder , timer );
9797 }
9898
99+ /**
100+ * Creates a scaling {@link EsThreadPoolExecutor} using an unbounded work queue.
101+ * <p>
102+ * The {@link EsThreadPoolExecutor} scales the same way as a regular {@link ThreadPoolExecutor} until the core pool size
103+ * (and at least 1) is reached: each time a task is submitted a new worker is added regardless of whether an idle worker is available.
104+ * <p>
105+ * Once having reached the core pool size, a {@link ThreadPoolExecutor} will only add a new worker if the work queue rejects
106+ * a task offer. Typically, using a regular unbounded queue, task offers won't ever be rejected, meaning the worker pool would never
107+ * scale beyond the core pool size.
108+ * <p>
109+ * Scaling {@link EsThreadPoolExecutor}s use a customized unbounded {@link LinkedTransferQueue}, which rejects every task offer unless
110+ * it can be immediately transferred to an available idle worker. If no such worker is available, the executor will add
111+ * a new worker if capacity remains, otherwise the task is rejected and then appended to the work queue via the {@link ForceQueuePolicy}
112+ * rejection handler.
113+ */
99114 public static EsThreadPoolExecutor newScaling (
100115 String name ,
101116 int min ,
@@ -107,10 +122,12 @@ public static EsThreadPoolExecutor newScaling(
107122 ThreadContext contextHolder ,
108123 TaskTrackingConfig config
109124 ) {
110- ExecutorScalingQueue <Runnable > queue = new ExecutorScalingQueue <>();
111- EsThreadPoolExecutor executor ;
125+ LinkedTransferQueue <Runnable > queue = newUnboundedScalingLTQueue (min , max );
126+ // Force queued work via ForceQueuePolicy might starve if no worker is available (if core size is empty),
127+ // probing the worker pool prevents this.
128+ boolean probeWorkerPool = min == 0 && queue instanceof ExecutorScalingQueue ;
112129 if (config .trackExecutionTime ()) {
113- executor = new TaskExecutionTimeTrackingEsThreadPoolExecutor (
130+ return new TaskExecutionTimeTrackingEsThreadPoolExecutor (
114131 name ,
115132 min ,
116133 max ,
@@ -119,27 +136,40 @@ public static EsThreadPoolExecutor newScaling(
119136 queue ,
120137 TimedRunnable ::new ,
121138 threadFactory ,
122- new ForceQueuePolicy (rejectAfterShutdown ),
139+ new ForceQueuePolicy (rejectAfterShutdown , probeWorkerPool ),
123140 contextHolder ,
124141 config
125142 );
126143 } else {
127- executor = new EsThreadPoolExecutor (
144+ return new EsThreadPoolExecutor (
128145 name ,
129146 min ,
130147 max ,
131148 keepAliveTime ,
132149 unit ,
133150 queue ,
134151 threadFactory ,
135- new ForceQueuePolicy (rejectAfterShutdown ),
152+ new ForceQueuePolicy (rejectAfterShutdown , probeWorkerPool ),
136153 contextHolder
137154 );
138155 }
139- queue .executor = executor ;
140- return executor ;
141156 }
142157
158+ /**
159+ * Creates a scaling {@link EsThreadPoolExecutor} using an unbounded work queue.
160+ * <p>
161+ * The {@link EsThreadPoolExecutor} scales the same way as a regular {@link ThreadPoolExecutor} until the core pool size
162+ * (and at least 1) is reached: each time a task is submitted a new worker is added regardless of whether an idle worker is available.
163+ * <p>
164+ * Once having reached the core pool size, a {@link ThreadPoolExecutor} will only add a new worker if the work queue rejects
165+ * a task offer. Typically, using a regular unbounded queue, task offers won't ever be rejected, meaning the worker pool would never
166+ * scale beyond the core pool size.
167+ * <p>
168+ * Scaling {@link EsThreadPoolExecutor}s use a customized unbounded {@link LinkedTransferQueue}, which rejects every task offer unless
169+ * it can be immediately transferred to an available idle worker. If no such worker is available, the executor will add
170+ * a new worker if capacity remains, otherwise the task is rejected and then appended to the work queue via the {@link ForceQueuePolicy}
171+ * rejection handler.
172+ */
143173 public static EsThreadPoolExecutor newScaling (
144174 String name ,
145175 int min ,
@@ -389,32 +419,58 @@ public boolean isSystem() {
389419 */
390420 private EsExecutors () {}
391421
392- static class ExecutorScalingQueue <E > extends LinkedTransferQueue <E > {
422+ private static <E > LinkedTransferQueue <E > newUnboundedScalingLTQueue (int corePoolSize , int maxPoolSize ) {
423+ if (maxPoolSize == 1 || maxPoolSize == corePoolSize ) {
424+ // scaling beyond core pool size (or 1) not required, use a regular unbounded LinkedTransferQueue
425+ return new LinkedTransferQueue <>();
426+ }
427+ // scaling beyond core pool size with an unbounded queue requires ExecutorScalingQueue
428+ // note, reconfiguration of core / max pool size not supported in EsThreadPoolExecutor
429+ return new ExecutorScalingQueue <>();
430+ }
393431
394- ThreadPoolExecutor executor ;
432+ /**
433+ * Customized {@link LinkedTransferQueue} to allow a {@link ThreadPoolExecutor} to scale beyond its core pool size despite having an
434+ * unbounded queue.
435+ * <p>
436+ * Note, usage of unbounded work queues is a problem in itself. For one, it makes error-prone customizations necessary so that
437+ * thread pools can scale up adequately. But worse, infinite queues prevent backpressure and impose a high risk of causing OOM errors.
438+ * <a href="https://github.com/elastic/elasticsearch/issues/18613">Github #18613</a> captures various long-outstanding but important
439+ * improvements to thread pools.
440+ * <p>
441+ * Once having reached its core pool size, a {@link ThreadPoolExecutor} will only add more workers if capacity remains and
442+ * the task offer is rejected by the work queue. Typically that's never the case when using a regular unbounded queue.
443+ * <p>
444+ * This customized implementation rejects every task offer unless it can be immediately transferred to an available idle worker.
445+ * It relies on {@link ForceQueuePolicy} rejection handler to append the task to the work queue if no additional worker can be added
446+ * and the task is rejected by the executor.
447+ * <p>
448+ * Note, {@link ForceQueuePolicy} cannot guarantee there will be available workers when appending tasks directly to the queue.
449+ * For that reason {@link ExecutorScalingQueue} cannot be used with executors with empty core and max pool size of 1:
450+ * the only available worker could time out just about at the same time as the task is appended, see
451+ * <a href="https://github.com/elastic/elasticsearch/issues/124667">Github #124667</a> for more details.
452+ * <p>
453+ * Note, configuring executors using core = max size in combination with {@code allowCoreThreadTimeOut} could be an alternative to
454+ * {@link ExecutorScalingQueue}. However, the scaling behavior would be very different: Using {@link ExecutorScalingQueue}
455+ * we are able to reuse idle workers if available by means of {@link ExecutorScalingQueue#tryTransfer(Object)}.
456+ * If setting core = max size, the executor will add a new worker for every task submitted until reaching the core/max pool size
457+ * even if there are idle workers available.
458+ */
459+ static class ExecutorScalingQueue <E > extends LinkedTransferQueue <E > {
395460
396461 ExecutorScalingQueue () {}
397462
398463 @ Override
399464 public boolean offer (E e ) {
400- // first try to transfer to a waiting worker thread
401- if (tryTransfer (e ) == false ) {
402- // check if there might be spare capacity in the thread
403- // pool executor
404- int left = executor .getMaximumPoolSize () - executor .getCorePoolSize ();
405- if (left > 0 ) {
406- // reject queuing the task to force the thread pool
407- // executor to add a worker if it can; combined
408- // with ForceQueuePolicy, this causes the thread
409- // pool to always scale up to max pool size and we
410- // only queue when there is no spare capacity
411- return false ;
412- } else {
413- return super .offer (e );
414- }
415- } else {
416- return true ;
465+ if (e == EsThreadPoolExecutor .WORKER_PROBE ) { // referential equality
466+ // this probe ensures a worker is available after force queueing a task via ForceQueuePolicy
467+ return super .offer (e );
417468 }
469+ // try to transfer to a waiting worker thread
470+ // otherwise reject queuing the task to force the thread pool executor to add a worker if it can;
471+ // combined with ForceQueuePolicy, this causes the thread pool to always scale up to max pool size
472+ // so that we only queue when there is no spare capacity
473+ return tryTransfer (e );
418474 }
419475
420476 // Overridden to workaround a JDK bug introduced in JDK 21.0.2
@@ -456,15 +512,24 @@ static class ForceQueuePolicy extends EsRejectedExecutionHandler {
456512 */
457513 private final boolean rejectAfterShutdown ;
458514
515+ /**
516+ * Flag to indicate if the worker pool needs to be probed after force queuing a task to guarantee a worker is available.
517+ */
518+ private final boolean probeWorkerPool ;
519+
459520 /**
460521 * @param rejectAfterShutdown indicates if {@link Runnable} should be rejected once the thread pool is shutting down
461522 */
462- ForceQueuePolicy (boolean rejectAfterShutdown ) {
523+ ForceQueuePolicy (boolean rejectAfterShutdown , boolean probeWorkerPool ) {
463524 this .rejectAfterShutdown = rejectAfterShutdown ;
525+ this .probeWorkerPool = probeWorkerPool ;
464526 }
465527
466528 @ Override
467529 public void rejectedExecution (Runnable task , ThreadPoolExecutor executor ) {
530+ if (task == EsThreadPoolExecutor .WORKER_PROBE ) { // referential equality
531+ return ;
532+ }
468533 if (rejectAfterShutdown ) {
469534 if (executor .isShutdown ()) {
470535 reject (executor , task );
@@ -481,12 +546,19 @@ public void rejectedExecution(Runnable task, ThreadPoolExecutor executor) {
481546 }
482547 }
483548
484- private static void put (ThreadPoolExecutor executor , Runnable task ) {
549+ private void put (ThreadPoolExecutor executor , Runnable task ) {
485550 final BlockingQueue <Runnable > queue = executor .getQueue ();
486- // force queue policy should only be used with a scaling queue
487- assert queue instanceof ExecutorScalingQueue ;
551+ // force queue policy should only be used with a scaling queue (ExecutorScalingQueue / LinkedTransferQueue)
552+ assert queue instanceof LinkedTransferQueue ;
488553 try {
489554 queue .put (task );
555+ if (probeWorkerPool && task == queue .peek ()) { // referential equality
556+ // If the task is at the head of the queue, we can assume the queue was previously empty. In this case available workers
557+ // might have timed out in the meanwhile. To prevent the task from starving, we submit a noop probe to the executor.
558+ // Note, this deliberately doesn't check getPoolSize()==0 to avoid potential race conditions,
559+ // as the count in the atomic state (used by workerCountOf) is decremented first.
560+ executor .execute (EsThreadPoolExecutor .WORKER_PROBE );
561+ }
490562 } catch (final InterruptedException e ) {
491563 assert false : "a scaling queue never blocks so a put to it can never be interrupted" ;
492564 throw new AssertionError (e );
0 commit comments