@@ -91,6 +91,8 @@ public class ShardBulkInferenceActionFilter implements MappedActionFilter {
     public static Setting<ByteSizeValue> INDICES_INFERENCE_BATCH_SIZE = Setting.byteSizeSetting(
         "indices.inference.batch_size",
         DEFAULT_BATCH_SIZE,
+        ByteSizeValue.ONE,
+        ByteSizeValue.ofBytes(100),
         Setting.Property.NodeScope,
         Setting.Property.OperatorDynamic
     );
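The two new arguments select the bounded `Setting.byteSizeSetting` overload, so out-of-range values are rejected when the setting is parsed rather than silently accepted. A minimal sketch of that behavior, with a hypothetical key and default (only the bounds mirror the diff):

```java
import org.elasticsearch.common.settings.Setting;
import org.elasticsearch.common.settings.Settings;
import org.elasticsearch.common.unit.ByteSizeValue;

class BatchSizeBoundsDemo {
    // "demo.batch_size" and the 50-byte default are made up for illustration.
    static final Setting<ByteSizeValue> BATCH_SIZE = Setting.byteSizeSetting(
        "demo.batch_size",
        ByteSizeValue.ofBytes(50),   // default, inside the bounds
        ByteSizeValue.ONE,           // minimum: 1 byte
        ByteSizeValue.ofBytes(100),  // maximum: 100 bytes
        Setting.Property.NodeScope
    );

    public static void main(String[] args) {
        var ok = Settings.builder().put("demo.batch_size", "64b").build();
        System.out.println(BATCH_SIZE.get(ok)); // prints 64b

        var tooBig = Settings.builder().put("demo.batch_size", "1kb").build();
        BATCH_SIZE.get(tooBig); // throws IllegalArgumentException: exceeds the maximum
    }
}
```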
@@ -170,6 +172,7 @@ private record InferenceProvider(InferenceService service, Model model) {}
      * @param offsetAdjustment The adjustment to apply to the chunk text offsets.
      */
     private record FieldInferenceRequest(
+        String inferenceId,
         int bulkItemIndex,
         String field,
         String sourceField,
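Carrying `inferenceId` on each request lets the caller group requests by endpoint instead of having the producer write into a shared map. A hedged sketch of that grouping idiom, with the record's component list reconstructed from the constructor call later in this diff:

```java
import java.util.List;
import java.util.Map;
import java.util.stream.Collectors;

class GroupingSketch {
    // Simplified stand-in for the real record.
    record FieldInferenceRequest(String inferenceId, int bulkItemIndex, String field,
                                 String sourceField, String input, int order, int offsetAdjustment) {}

    // Same grouping the new while-loop performs with computeIfAbsent.
    static Map<String, List<FieldInferenceRequest>> groupByEndpoint(List<FieldInferenceRequest> requests) {
        return requests.stream().collect(Collectors.groupingBy(FieldInferenceRequest::inferenceId));
    }
}
```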
@@ -249,21 +252,32 @@ private void executeNext(int itemOffset) {
         }
 
         var items = bulkShardRequest.items();
-        Map<String, List<FieldInferenceRequest>> fieldRequestsMap = new HashMap<>();
+        Map<String, List<FieldInferenceRequest>> requestsMap = new HashMap<>();
         long totalInputLength = 0;
         int itemIndex = itemOffset;
-        for (; itemIndex < bulkShardRequest.items().length; itemIndex++) {
+
+        while (itemIndex < items.length && totalInputLength < batchSizeInBytes) {
             var item = items[itemIndex];
-            totalInputLength += addFieldInferenceRequests(item, itemIndex, fieldRequestsMap);
-            if (totalInputLength >= batchSizeInBytes) {
+            var requests = createFieldInferenceRequests(item, itemIndex);
+
+            totalInputLength += requests.stream().mapToLong(r -> r.input.length()).sum();
+            if (requestsMap.size() > 0 && totalInputLength >= batchSizeInBytes) {
+                /**
+                 * Exits early because the new requests exceed the allowable size.
+                 * These requests will be processed in the next iteration.
+                 */
                 break;
             }
+
+            for (var request : requests) {
+                requestsMap.computeIfAbsent(request.inferenceId, k -> new ArrayList<>()).add(request);
+            }
+            itemIndex++;
         }
-        int nextItemIndex = itemIndex + 1;
+        int nextItemOffset = itemIndex;
         Runnable onInferenceCompletion = () -> {
             try {
-                int limit = Math.min(nextItemIndex, items.length);
-                for (int i = itemOffset; i < limit; i++) {
+                for (int i = itemOffset; i < nextItemOffset; i++) {
                     var result = inferenceResults.get(i);
                     if (result == null) {
                         continue;
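Taken together, the new loop is a greedy byte-budget batcher: the first item is always admitted (so a single oversized document still makes progress), and the non-first item whose inputs cross `batchSizeInBytes` is deferred to the next `executeNext` round. The same pattern in isolation, with hypothetical names:

```java
import java.util.ArrayList;
import java.util.List;

class ByteBudgetBatcher {
    /**
     * Fills {@code batch} starting at {@code offset} and returns the offset where
     * the next batch should begin. Mirrors the loop above: at least one item is
     * always taken, and the item that crosses the budget is left for the next call.
     */
    static int fillBatch(List<String> inputs, int offset, long budgetBytes, List<String> batch) {
        long total = 0;
        int i = offset;
        while (i < inputs.size() && total < budgetBytes) {
            total += inputs.get(i).length();
            if (batch.isEmpty() == false && total >= budgetBytes) {
                break; // over budget: defer this item to the next batch
            }
            batch.add(inputs.get(i));
            i++;
        }
        return i;
    }

    public static void main(String[] args) {
        List<String> inputs = List.of("aaaa", "bbbb", "cccc");
        int offset = 0;
        while (offset < inputs.size()) {
            List<String> batch = new ArrayList<>();
            offset = fillBatch(inputs, offset, 10, batch);
            System.out.println(batch); // prints [aaaa, bbbb] then [cccc]
        }
    }
}
```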
@@ -278,12 +292,12 @@ private void executeNext(int itemOffset) {
                     inferenceResults.set(i, null);
                 }
             } finally {
-                executeNext(nextItemIndex);
+                executeNext(nextItemOffset);
             }
         };
 
         try (var releaseOnFinish = new RefCountingRunnable(onInferenceCompletion)) {
-            for (var entry : fieldRequestsMap.entrySet()) {
+            for (var entry : requestsMap.entrySet()) {
                 executeChunkedInferenceAsync(entry.getKey(), null, entry.getValue(), releaseOnFinish.acquire());
             }
         }
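`RefCountingRunnable` runs `onInferenceCompletion` once every acquired reference has been released, i.e. after all per-endpoint inference calls finish; the try-with-resources releases the dispatch loop's own reference. A rough JDK-only stand-in, assuming those acquire/release semantics:

```java
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.concurrent.atomic.AtomicInteger;

final class RefCountedCompletion {
    private final AtomicInteger refs = new AtomicInteger(1); // the dispatch loop's own ref
    private final Runnable onCompletion;

    RefCountedCompletion(Runnable onCompletion) {
        this.onCompletion = onCompletion;
    }

    Runnable acquire() {
        refs.incrementAndGet();
        return this::release;
    }

    void release() {
        if (refs.decrementAndGet() == 0) {
            onCompletion.run();
        }
    }

    public static void main(String[] args) {
        ExecutorService pool = Executors.newFixedThreadPool(4);
        var completion = new RefCountedCompletion(() -> System.out.println("all batches done"));
        for (int i = 0; i < 3; i++) {
            Runnable ref = completion.acquire();
            pool.execute(() -> {
                try {
                    // ... asynchronous inference call would go here ...
                } finally {
                    ref.run(); // release, even on failure
                }
            });
        }
        completion.release(); // drop the loop's own reference
        pool.shutdown();
    }
}
```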
@@ -411,18 +425,16 @@ public void onFailure(Exception exc) {
     }
 
     /**
-     * Adds all inference requests associated with their respective inference IDs to the given {@code requestsMap}
-     * for the specified {@code item}.
+     * Returns all inference requests from the provided {@link BulkItemRequest}.
      *
      * @param item The bulk request item to process.
      * @param itemIndex The position of the item within the original bulk request.
-     * @param requestsMap A map storing inference requests, where each key is an inference ID,
-     *                    and the value is a list of associated {@link FieldInferenceRequest} objects.
-     * @return The total content length of all newly added requests, or {@code 0} if no requests were added.
+     * @return The list of {@link FieldInferenceRequest} associated with the item.
      */
-    private long addFieldInferenceRequests(BulkItemRequest item, int itemIndex, Map<String, List<FieldInferenceRequest>> requestsMap) {
+    private List<FieldInferenceRequest> createFieldInferenceRequests(BulkItemRequest item, int itemIndex) {
         boolean isUpdateRequest = false;
         final IndexRequest indexRequest;
+
         if (item.request() instanceof IndexRequest ir) {
             indexRequest = ir;
         } else if (item.request() instanceof UpdateRequest updateRequest) {
@@ -436,16 +448,16 @@ private long addFieldInferenceRequests(BulkItemRequest item, int itemIndex, Map<
                         SemanticTextFieldMapper.CONTENT_TYPE
                     )
                 );
-                return 0;
+                return List.of();
             }
             indexRequest = updateRequest.doc();
         } else {
             // ignore delete request
-            return 0;
+            return List.of();
         }
 
         final Map<String, Object> docMap = indexRequest.sourceAsMap();
-        long inputLength = 0;
+        List<FieldInferenceRequest> requests = new ArrayList<>();
         for (var entry : fieldInferenceMap.values()) {
             String field = entry.getName();
             String inferenceId = entry.getInferenceId();
@@ -514,12 +526,9 @@ private long addFieldInferenceRequests(BulkItemRequest item, int itemIndex, Map<
                     break;
                 }
 
-                inputLength += values.stream().mapToLong(String::length).sum();
-
-                List<FieldInferenceRequest> requests = requestsMap.computeIfAbsent(inferenceId, k -> new ArrayList<>());
                 int offsetAdjustment = 0;
                 for (String v : values) {
-                    requests.add(new FieldInferenceRequest(itemIndex, field, sourceField, v, order++, offsetAdjustment));
+                    requests.add(new FieldInferenceRequest(inferenceId, itemIndex, field, sourceField, v, order++, offsetAdjustment));
 
                     // When using the inference metadata fields format, all the input values are concatenated so that the
                     // chunk text offsets are expressed in the context of a single string. Calculate the offset adjustment
@@ -528,7 +537,7 @@ private long addFieldInferenceRequests(BulkItemRequest item, int itemIndex, Map<
                 }
             }
         }
-        return inputLength;
+        return requests;
     }
 
     private FieldInferenceResponseAccumulator ensureResponseAccumulatorSlot(int id) {
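Net effect of the refactor: `addFieldInferenceRequests`, which wrote into a shared map and returned a byte count, becomes the side-effect-free `createFieldInferenceRequests`, and the caller now owns both the length accounting and the grouping. A condensed, self-contained sketch of that shape change, with toy types standing in for the real ones:

```java
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;

class PureProducerSketch {
    record Request(String endpointId, String input) {}

    // After the refactor: a pure producer, trivially unit-testable.
    static List<Request> createRequests(String doc) {
        if (doc.isEmpty()) {
            return List.of(); // mirrors the early `return List.of()` exits above
        }
        List<Request> requests = new ArrayList<>();
        requests.add(new Request("endpoint-1", doc));
        return requests;
    }

    public static void main(String[] args) {
        Map<String, List<Request>> byEndpoint = new HashMap<>();
        List<Request> requests = createRequests("some text");
        long bytes = requests.stream().mapToLong(r -> r.input().length()).sum();
        if (bytes < 100) { // the caller decides whether the batch still fits
            for (var r : requests) {
                byEndpoint.computeIfAbsent(r.endpointId(), k -> new ArrayList<>()).add(r);
            }
        }
        System.out.println(byEndpoint);
    }
}
```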