Performance improvements.

afoucret · afoucret · commit 7cb2eaeb46b2 · 2025-12-11T11:33:14.000+01:00
diff --git a/x-pack/plugin/esql/src/main/java/org/elasticsearch/xpack/esql/inference/InferenceOperator.java b/x-pack/plugin/esql/src/main/java/org/elasticsearch/xpack/esql/inference/InferenceOperator.java
@@ -80,7 +80,10 @@ protected void performAsync(Page input, ActionListener<OngoingInferenceResult> l
             BulkInferenceRequestItemIterator requests = requests(input);
             listener = ActionListener.releaseBefore(requests, listener);
 
-            OngoingInferenceResult result = new OngoingInferenceResult(input, new ArrayList<>());
+            // Pre-size the list based on the estimated request count
+            int estimatedSize = requests.estimatedSize();
+            int initialCapacity = Math.max(10, Math.min(estimatedSize, 10000)); // Cap at 10k for safety
+            OngoingInferenceResult result = new OngoingInferenceResult(input, new ArrayList<>(initialCapacity));
             listener = listener.delegateResponse((l, e) -> {
                 Releasables.close(result);
                 l.onFailure(e);
diff --git a/x-pack/plugin/esql/src/main/java/org/elasticsearch/xpack/esql/inference/bulk/BulkInferenceExecutionState.java b/x-pack/plugin/esql/src/main/java/org/elasticsearch/xpack/esql/inference/bulk/BulkInferenceExecutionState.java
@@ -10,8 +10,8 @@
 import org.elasticsearch.compute.operator.FailureCollector;
 import org.elasticsearch.index.seqno.LocalCheckpointTracker;
 
+import java.util.HashMap;
 import java.util.Map;
-import java.util.concurrent.ConcurrentHashMap;
 import java.util.concurrent.atomic.AtomicBoolean;
 
 import static org.elasticsearch.index.seqno.SequenceNumbers.NO_OPS_PERFORMED;
@@ -26,8 +26,25 @@ public class BulkInferenceExecutionState {
     private final Map<Long, BulkInferenceResponse> bufferedResponses;
     private final AtomicBoolean finished = new AtomicBoolean(false);
 
+    /**
+     * Creates a new execution state with default buffer capacity.
+     */
     public BulkInferenceExecutionState() {
-        this.bufferedResponses = new ConcurrentHashMap<>();
+        this(16);
+    }
+
+    /**
+     * Creates a new execution state with the specified initial buffer capacity.
+     * <p>
+     * The initial capacity should be sized based on the expected number of out-of-order responses.
+     * A good heuristic is to use a fraction of maxRunningTasks, as that bounds the number of
+     * concurrent in-flight responses that could arrive out-of-order.
+     * </p>
+     *
+     * @param initialCapacity The initial capacity for the response buffer
+     */
+    public BulkInferenceExecutionState(int initialCapacity) {
+        this.bufferedResponses = new HashMap<>(initialCapacity);
     }
 
     /**
@@ -68,9 +85,9 @@ public void markSeqNoAsPersisted(long seqNo) {
     }
 
     /**
-     *  Add an inference response to the buffer and marks the corresponding sequence number as processed.
+     * Buffers an inference response and marks the corresponding sequence number as processed.
      *
-     *  @param response The bulk inference response object
+     * @param response The bulk inference response object
      */
     public synchronized void onInferenceResponse(BulkInferenceResponse response) {
         if (response != null && failureCollector.hasFailure() == false) {
diff --git a/x-pack/plugin/esql/src/main/java/org/elasticsearch/xpack/esql/inference/bulk/BulkInferenceRunner.java b/x-pack/plugin/esql/src/main/java/org/elasticsearch/xpack/esql/inference/bulk/BulkInferenceRunner.java
@@ -10,12 +10,12 @@
 import org.elasticsearch.action.ActionListener;
 import org.elasticsearch.action.support.ThreadedActionListener;
 import org.elasticsearch.client.internal.Client;
-import org.elasticsearch.common.util.concurrent.ConcurrentCollections;
 import org.elasticsearch.threadpool.ThreadPool;
 import org.elasticsearch.xpack.core.inference.action.InferenceAction;
 
 import java.util.Queue;
 import java.util.Set;
+import java.util.concurrent.ConcurrentHashMap;
 import java.util.concurrent.ConcurrentLinkedQueue;
 import java.util.concurrent.ExecutorService;
 import java.util.concurrent.Semaphore;
@@ -42,40 +42,27 @@ public class BulkInferenceRunner {
 
     private final Client client;
     private final Semaphore permits;
+    private final int maxRunningTasks;
     private final ExecutorService executor;
 
     /**
-     * Custom concurrent queue that prevents duplicate bulk requests from being queued.
+     * Tracks bulk requests that are currently queued to prevent duplicates.
      * <p>
-     * This queue implementation ensures fairness among multiple concurrent bulk operations
-     * by preventing the same bulk request from being queued multiple times. It uses a
-     * backing concurrent set to track which requests are already queued.
+     * This set ensures fairness among multiple concurrent bulk operations by preventing
+     * the same bulk request from being queued multiple times. It is backed by
+     * {@link ConcurrentHashMap#newKeySet()}, so it is thread-safe without external synchronization.
      * </p>
      */
-    private final Queue<BulkInferenceRequest> pendingBulkRequests = new ConcurrentLinkedQueue<>() {
-        private final Set<BulkInferenceRequest> requests = ConcurrentCollections.newConcurrentSet();
+    private final Set<BulkInferenceRequest> trackedRequests = ConcurrentHashMap.newKeySet();
 
-        @Override
-        public boolean offer(BulkInferenceRequest bulkInferenceRequest) {
-            synchronized (requests) {
-                if (requests.add(bulkInferenceRequest)) {
-                    return super.offer(bulkInferenceRequest);
-                }
-                return false; // Already exists, don't add duplicate
-            }
-        }
-
-        @Override
-        public BulkInferenceRequest poll() {
-            synchronized (requests) {
-                BulkInferenceRequest request = super.poll();
-                if (request != null) {
-                    requests.remove(request);
-                }
-                return request;
-            }
-        }
-    };
+    /**
+     * Queue of pending bulk requests waiting for permit availability.
+     * <p>
+     * Works in conjunction with {@link #trackedRequests} to ensure no duplicate requests
+     * are queued. The set and the queue are updated in separate, non-atomic steps.
+     * </p>
+     */
+    private final Queue<BulkInferenceRequest> pendingBulkRequests = new ConcurrentLinkedQueue<>();
 
     /**
      * Constructs a new throttled inference runner with the specified configuration.
@@ -85,6 +72,7 @@ public BulkInferenceRequest poll() {
      */
     public BulkInferenceRunner(Client client, int maxRunningTasks) {
         this.permits = new Semaphore(maxRunningTasks);
+        this.maxRunningTasks = maxRunningTasks;
         this.client = client;
         this.executor = client.threadPool().executor(ThreadPool.Names.SEARCH);
     }
@@ -142,7 +130,7 @@ private class BulkInferenceRequest {
         private final Consumer<BulkInferenceResponse> responseConsumer;
         private final ActionListener<Void> completionListener;
 
-        private final BulkInferenceExecutionState executionState = new BulkInferenceExecutionState();
+        private final BulkInferenceExecutionState executionState;
         private final AtomicBoolean responseSent = new AtomicBoolean(false);
 
         BulkInferenceRequest(
@@ -153,6 +141,15 @@ private class BulkInferenceRequest {
             this.requests = requests;
             this.responseConsumer = responseConsumer;
             this.completionListener = completionListener;
+
+            // Initialize buffer capacity based on expected out-of-order responses.
+            // The capacity is half of the smaller of:
+            // 1. maxRunningTasks (upper bound on concurrently in-flight requests)
+            // 2. the estimated number of requests in this bulk
+            // with a floor of 1, trading memory efficiency against rehashing for typical workloads.
+            int estimatedSize = requests.estimatedSize();
+            int bufferCapacity = Math.max(1, Math.min(estimatedSize, maxRunningTasks) / 2);
+            this.executionState = new BulkInferenceExecutionState(bufferCapacity);
         }
 
         /**
@@ -180,7 +177,7 @@ private BulkInferenceRequestItem pollPendingRequest() {
          * This method implements a continuation-based asynchronous pattern with the following features:
          * - Queue-based fairness: Multiple bulk requests can be queued and processed fairly
          * - Permit-based concurrency control: Limits concurrent inference requests using semaphores
-         * - Hybrid recursion strategy: Uses direct recursion for performance up to 100 levels,
+         * - Hybrid recursion strategy: Uses direct recursion for performance up to 500 levels,
          * then switches to executor-based continuation to prevent stack overflow
          * - Duplicate prevention: Custom queue prevents the same bulk request from being queued multiple times
          * </p>
@@ -191,7 +188,7 @@ private BulkInferenceRequestItem pollPendingRequest() {
          * 3. Polls for the next available request from the iterator
          * 4. If no requests available, schedules the next queued bulk request
          * 5. Executes the request asynchronously with proper continuation handling
-         * 6. Uses hybrid recursion: direct calls up to 100 levels, executor-based beyond that
+         * 6. Uses hybrid recursion: direct calls up to 500 levels, executor-based beyond that
          * </p>
          * <p>
          * The loop terminates when:
@@ -209,7 +206,10 @@ private void executePendingRequests(int recursionDepth) {
                 while (executionState.finished() == false) {
                     if (permits.tryAcquire() == false) {
                         if (requests.hasNext()) {
-                            pendingBulkRequests.add(this);
+                            // Add to tracking set first to prevent duplicates
+                            if (trackedRequests.add(this)) {
+                                pendingBulkRequests.offer(this);
+                            }
                         }
                         return;
                     } else {
@@ -228,6 +228,10 @@ private void executePendingRequests(int recursionDepth) {
                             }
 
                             if (nexBulkRequest != null) {
+                                // Remove from tracking set since we're about to process it
+                                trackedRequests.remove(nexBulkRequest);
+                                // Hand the next bulk request to the executor instead of calling it inline,
+                                // so it does not run on top of the current call stack
                                 executor.execute(nexBulkRequest::executePendingRequests);
                             }
 
@@ -242,49 +246,54 @@ private void executePendingRequests(int recursionDepth) {
 
                         final ActionListener<InferenceAction.Response> inferenceResponseListener = new ThreadedActionListener<>(
                             executor,
-                            ActionListener.runAfter(
-                                ActionListener.wrap(
-                                    r -> executionState.onInferenceResponse(new BulkInferenceResponse(bulkInferenceRequestItem, r)),
-                                    e -> executionState.onInferenceException(bulkInferenceRequestItem.seqNo(), e)
-                                ),
-                                () -> {
-                                    // Release the permit we used
-                                    permits.release();
-
-                                    try {
-                                        synchronized (executionState) {
-                                            persistPendingResponses();
-                                        }
+                            ActionListener.runAfter(ActionListener.wrap(r -> {
+                                BulkInferenceResponse bulkResponse = new BulkInferenceResponse(bulkInferenceRequestItem, r);
+                                executionState.onInferenceResponse(bulkResponse);
+                            }, e -> executionState.onInferenceException(bulkInferenceRequestItem.seqNo(), e)), () -> {
+                                // Release the permit we used
+                                permits.release();
+
+                                try {
+                                    synchronized (executionState) {
+                                        persistPendingResponses();
+                                    }
 
-                                        if (executionState.finished() && responseSent.compareAndSet(false, true)) {
-                                            onBulkCompletion();
-                                        }
+                                    if (executionState.finished() && responseSent.compareAndSet(false, true)) {
+                                        onBulkCompletion();
+                                    }
 
-                                        if (responseSent.get()) {
-                                            // Response has already been sent
-                                            // No need to continue processing this bulk.
-                                            // Check if another bulk request is pending for execution.
-                                            BulkInferenceRequest nexBulkRequest = pendingBulkRequests.poll();
-                                            if (nexBulkRequest != null) {
-                                                executor.execute(nexBulkRequest::executePendingRequests);
-                                            }
-                                            return;
+                                    if (responseSent.get()) {
+                                        // Response has already been sent
+                                        // No need to continue processing this bulk.
+                                        // Check if another bulk request is pending for execution.
+                                        BulkInferenceRequest nexBulkRequest = pendingBulkRequests.poll();
+                                        if (nexBulkRequest != null) {
+                                            // Remove from tracking set since we're about to process it
+                                            trackedRequests.remove(nexBulkRequest);
+                                            // Hand the next bulk request to the executor instead of calling it inline,
+                                            // so it does not run on top of the current call stack
+                                            executor.execute(nexBulkRequest::executePendingRequests);
                                         }
-                                        if (executionState.finished() == false) {
-                                            // Execute any pending requests if any
-                                            if (recursionDepth > 100) {
-                                                executor.execute(this::executePendingRequests);
-                                            } else {
-                                                this.executePendingRequests(recursionDepth + 1);
-                                            }
-                                        }
-                                    } catch (Exception e) {
-                                        if (responseSent.compareAndSet(false, true)) {
-                                            completionListener.onFailure(e);
+                                        return;
+                                    }
+                                    if (executionState.finished() == false) {
+                                        // Execute any pending requests if any
+                                        if (recursionDepth > 500) {
+                                            // Reset recursion depth by submitting to executor
+                                            // This prevents unbounded stack growth while maintaining performance
+                                            executor.execute(this::executePendingRequests);
+                                        } else {
+                                            this.executePendingRequests(recursionDepth + 1);
                                         }
                                     }
+                                } catch (Exception e) {
+                                    if (responseSent.compareAndSet(false, true)) {
+                                        // Clean up tracking set before notifying failure
+                                        trackedRequests.remove(BulkInferenceRequest.this);
+                                        completionListener.onFailure(e);
+                                    }
                                 }
-                            )
+                            })
                         );
 
                         // Handle null requests (edge case in some iterators)
@@ -305,6 +314,8 @@ private void executePendingRequests(int recursionDepth) {
                 }
             } catch (Exception e) {
                 executionState.addFailure(e);
+                // Ensure cleanup on exception - remove from tracking set to prevent memory leak
+                trackedRequests.remove(this);
             }
         }
 
@@ -324,7 +335,9 @@ private void persistPendingResponses() {
                 if (executionState.hasFailure() == false) {
                     try {
                         BulkInferenceResponse response = executionState.fetchBufferedResponse(persistedSeqNo);
-                        responseConsumer.accept(response);
+                        if (response != null) {
+                            responseConsumer.accept(response);
+                        }
                     } catch (Exception e) {
                         executionState.addFailure(e);
                     }
@@ -335,18 +348,28 @@ private void persistPendingResponses() {
 
         /**
          * Call the completion listener when all requests have completed.
+         * Also ensures cleanup of this request from tracking structures to prevent memory leaks.
          */
         private void onBulkCompletion() {
-            if (executionState.hasFailure() == false) {
-                try {
-                    completionListener.onResponse(null);
-                    return;
-                } catch (Exception e) {
-                    executionState.addFailure(e);
+            try {
+                // Clean up tracking - remove this request from the tracking set
+                // in case it was queued but never processed
+                trackedRequests.remove(this);
+
+                if (executionState.hasFailure() == false) {
+                    try {
+                        completionListener.onResponse(null);
+                        return;
+                    } catch (Exception e) {
+                        executionState.addFailure(e);
+                    }
                 }
-            }
 
-            completionListener.onFailure(executionState.getFailure());
+                completionListener.onFailure(executionState.getFailure());
+            } finally {
+                // Ensure we're removed even if completion listener throws
+                trackedRequests.remove(this);
+            }
         }
     }
 
diff --git a/x-pack/plugin/esql/src/main/java/org/elasticsearch/xpack/esql/inference/completion/CompletionInferenceRequestIterator.java b/x-pack/plugin/esql/src/main/java/org/elasticsearch/xpack/esql/inference/completion/CompletionInferenceRequestIterator.java
@@ -35,6 +35,9 @@ class CompletionInferenceRequestIterator implements BulkInferenceRequestItemIter
 
     private int currentPos = 0;
 
+    private static final int[] SHAPE_SINGLE_ONE = new int[] { 1 };
+    private static final int[] SHAPE_SINGLE_ZERO = new int[] { 0 };
+
     /**
      * Constructs a new iterator from the given block of prompts.
      *
@@ -77,8 +80,7 @@ public BulkInferenceRequestItem next() {
         }
 
         // Create shape array of exact size
-        int[] shape = Arrays.copyOf(shapeBuffer, shapeSize);
-        return new BulkInferenceRequestItem(inferenceRequest(nextPrompt), shape);
+        return new BulkInferenceRequestItem(inferenceRequest(nextPrompt), createShape());
     }
 
     private void addToShape(int value) {
@@ -89,6 +91,14 @@ private void addToShape(int value) {
         shapeBuffer[shapeSize++] = value;
     }
 
+    private int[] createShape() {
+        if (shapeSize == 1) {
+            return shapeBuffer[0] == 1 ? SHAPE_SINGLE_ONE : SHAPE_SINGLE_ZERO;
+        }
+
+        return Arrays.copyOf(shapeBuffer, shapeSize);
+    }
+
     /**
      * Wraps a single prompt string into an {@link InferenceAction.Request}.
      */
diff --git a/x-pack/plugin/esql/src/main/java/org/elasticsearch/xpack/esql/inference/textembedding/TextEmbeddingOperatorOutputBuilder.java b/x-pack/plugin/esql/src/main/java/org/elasticsearch/xpack/esql/inference/textembedding/TextEmbeddingOperatorOutputBuilder.java
@@ -105,7 +105,7 @@ private static float[] getEmbeddingAsFloatArray(DenseEmbeddingResults.Embedding<
     private static float[] toFloatArray(byte[] values) {
         float[] floatArray = new float[values.length];
         for (int i = 0; i < values.length; i++) {
-            floatArray[i] = ((Byte) values[i]).floatValue();
+            floatArray[i] = (float) values[i];
         }
         return floatArray;
     }
diff --git a/x-pack/plugin/esql/src/test/java/org/elasticsearch/xpack/esql/inference/bulk/BulkInferenceRunnerTests.java b/x-pack/plugin/esql/src/test/java/org/elasticsearch/xpack/esql/inference/bulk/BulkInferenceRunnerTests.java

Original file line number	Diff line number	Diff line change
`@@ -105,7 +105,7 @@ private static float[] getEmbeddingAsFloatArray(DenseEmbeddingResults.Embedding<`
`105`	`105`	`private static float[] toFloatArray(byte[] values) {`
`106`	`106`	`float[] floatArray = new float[values.length];`
`107`	`107`	`for (int i = 0; i < values.length; i++) {`
`108`		`- floatArray[i] = ((Byte) values[i]).floatValue();`
	`108`	`+ floatArray[i] = (float) values[i];`
`109`	`109`	`}`
`110`	`110`	`return floatArray;`
`111`	`111`	`}`