@@ -25,7 +25,9 @@
import org.elasticsearch.cluster.metadata.InferenceFieldMetadata;
import org.elasticsearch.cluster.metadata.ProjectMetadata;
import org.elasticsearch.cluster.service.ClusterService;
import org.elasticsearch.common.bytes.BytesArray;
import org.elasticsearch.common.bytes.BytesReference;
import org.elasticsearch.common.bytes.CompositeBytesReference;
import org.elasticsearch.common.settings.Setting;
import org.elasticsearch.common.unit.ByteSizeValue;
import org.elasticsearch.common.util.concurrent.AtomicArray;
@@ -52,6 +54,7 @@
import org.elasticsearch.tasks.Task;
import org.elasticsearch.xcontent.XContent;
import org.elasticsearch.xcontent.XContentBuilder;
import org.elasticsearch.xcontent.XContentFactory;
import org.elasticsearch.xcontent.XContentParser;
import org.elasticsearch.xcontent.XContentParserConfiguration;
import org.elasticsearch.xcontent.XContentType;
@@ -469,8 +472,8 @@ private void recordRequestCountMetrics(Model model, int incrementBy, Throwable t
* Adds all inference requests associated with their respective inference IDs to the given {@code requestsMap}
* for the specified {@code item}.
*
* @param item The bulk request item to process.
* @param itemIndex The position of the item within the original bulk request.
* @param item The bulk request item to process.
* @param itemIndex The position of the item within the original bulk request.
* @param requestsMap A map storing inference requests, where each key is an inference ID,
* and the value is a list of associated {@link FieldInferenceRequest} objects.
* @return The total content length of all newly added requests, or {@code 0} if no requests were added.
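A minimal illustration of the requestsMap shape the javadoc above describes; the local variable names here are hypothetical and not part of the PR:

    // Hypothetical illustration only: one list of field requests per inference ID.
    Map<String, List<FieldInferenceRequest>> requestsMap = new HashMap<>();
    requestsMap.computeIfAbsent(inferenceId, k -> new ArrayList<>()).add(fieldInferenceRequest);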
@@ -671,27 +674,137 @@ private void applyInferenceResponses(BulkItemRequest item, FieldInferenceRespons
);
inferenceFieldsMap.put(fieldName, result);
}

BytesReference originalSource = indexRequest.source();
if (useLegacyFormat) {
var newDocMap = indexRequest.sourceAsMap();
for (var entry : inferenceFieldsMap.entrySet()) {
SemanticTextUtils.insertValue(entry.getKey(), newDocMap, entry.getValue());
}
indexRequest.source(newDocMap, indexRequest.getContentType());
XContentBuilder builder = XContentFactory.contentBuilder(indexRequest.getContentType());
builder.map(newDocMap);
var newSource = BytesReference.bytes(builder);
if (incrementIndexingPressure(item, indexRequest, newSource.length())) {
indexRequest.source(newSource, indexRequest.getContentType());
}
} else {
updateSourceWithInferenceFields(item, indexRequest, inferenceFieldsMap);
}
}

/**
* Updates the {@link IndexRequest}'s source to include additional inference fields.
* <p>
* If the original source uses an array-backed {@link BytesReference}, this method attempts an in-place update,
* reusing the existing array where possible and appending additional bytes only if needed.
* <p>
* If the original source is not array-backed, the entire source is replaced with the new source that includes
* the inference fields. In this case, the full size of the new source is accounted for in indexing pressure.
* <p>
* Note: We do not subtract the indexing pressure of the original source since its bytes may be pooled and not
* reclaimable by the garbage collector during the request lifecycle.
*
* @param item The {@link BulkItemRequest} being processed.
* @param indexRequest The {@link IndexRequest} whose source will be updated.
* @param inferenceFieldsMap A map of additional fields to append to the source.
* @throws IOException if building the new source fails.
*/
private void updateSourceWithInferenceFields(
BulkItemRequest item,
IndexRequest indexRequest,
Map<String, Object> inferenceFieldsMap
) throws IOException {
var originalSource = indexRequest.source();
final BytesReference newSource;

// Build a new source by appending the inference fields to the existing source.
try (XContentBuilder builder = XContentBuilder.builder(indexRequest.getContentType().xContent())) {
appendSourceAndInferenceMetadata(builder, originalSource, indexRequest.getContentType(), inferenceFieldsMap);
newSource = BytesReference.bytes(builder);
}

// Calculate the additional size to account for in indexing pressure.
final int additionalSize = originalSource.hasArray() ? newSource.length() - originalSource.length() : newSource.length();

// If we exceed the indexing pressure limit, do not proceed with the update.
if (incrementIndexingPressure(item, indexRequest, additionalSize) == false) {
return;
}

// Apply the updated source to the index request.
if (originalSource.hasArray()) {
// If the original source is backed by an array, perform in-place update:
// - Copy as much of the new source as fits into the original array.
System.arraycopy(
newSource.array(),
newSource.arrayOffset(),
Comment on lines +737 to +738
Contributor: Can we assume that newSource will always be array-backed?
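A hedged side note on the question above (not part of the PR): if newSource cannot be assumed to be array-backed, the copy could fall back to materializing it once. A minimal sketch under that assumption, using only existing BytesReference methods (hasArray, array, arrayOffset, toBytes); the local names newBytes/newOffset are made up for illustration:

    // Sketch: resolve newSource to a plain array before the in-place copy.
    final byte[] newBytes;
    final int newOffset;
    if (newSource.hasArray()) {
        newBytes = newSource.array();
        newOffset = newSource.arrayOffset();
    } else {
        // Rare path: materialize the merged source into a fresh byte[] once.
        newBytes = BytesReference.toBytes(newSource);
        newOffset = 0;
    }
    System.arraycopy(newBytes, newOffset, originalSource.array(), originalSource.arrayOffset(), originalSource.length());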
originalSource.array(),
originalSource.arrayOffset(),
originalSource.length()
);

int remainingSize = newSource.length() - originalSource.length();
if (remainingSize > 0) {
// If there are additional bytes, append them as a new BytesArray segment.
byte[] remainingBytes = new byte[remainingSize];
System.arraycopy(
newSource.array(),
newSource.arrayOffset() + originalSource.length(),
remainingBytes,
0,
remainingSize
);
indexRequest.source(
CompositeBytesReference.of(originalSource, new BytesArray(remainingBytes)),
indexRequest.getContentType()
);
} else {
// No additional bytes; just adjust the slice length.
indexRequest.source(originalSource.slice(0, newSource.length()));
}
} else {
try (XContentBuilder builder = XContentBuilder.builder(indexRequest.getContentType().xContent())) {
appendSourceAndInferenceMetadata(builder, indexRequest.source(), indexRequest.getContentType(), inferenceFieldsMap);
indexRequest.source(builder);
// If the original source is not array-backed, replace it entirely.
indexRequest.source(newSource, indexRequest.getContentType());
}
Comment on lines +732 to +766
Contributor: We don't have to do this in this PR, but it would be good to put this logic in a common place (IndexRequest? BytesReference?) so that we can leverage it in other places as well. I was thinking of adding methods like canUpdateInPlace and updateInPlace.
}
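A rough sketch of the canUpdateInPlace/updateInPlace helpers suggested in the comment above, if they were hoisted onto a shared utility. The method names come from the comment; the signatures, placement, and the Math.min guard are assumptions, not part of this PR:

    // Hedged sketch of the helpers proposed above; not part of this PR.
    static boolean canUpdateInPlace(BytesReference original) {
        // In-place reuse only works when the original bytes sit in a single array we can overwrite.
        return original.hasArray();
    }

    static BytesReference updateInPlace(BytesReference original, BytesReference updated) {
        assert canUpdateInPlace(original);
        byte[] updatedBytes = BytesReference.toBytes(updated);
        // Overwrite the original array with as much of the updated content as fits.
        int copyLen = Math.min(original.length(), updated.length());
        System.arraycopy(updatedBytes, 0, original.array(), original.arrayOffset(), copyLen);
        int remaining = updated.length() - original.length();
        if (remaining <= 0) {
            // Updated content fits entirely; shrink the view to the new length.
            return original.slice(0, updated.length());
        }
        // Append the bytes that did not fit as a second segment.
        byte[] tail = new byte[remaining];
        System.arraycopy(updatedBytes, original.length(), tail, 0, remaining);
        return CompositeBytesReference.of(original, new BytesArray(tail));
    }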

/**
* Appends the original source and the new inference metadata field directly to the provided
* {@link XContentBuilder}, avoiding the need to materialize the original source as a {@link Map}.
*/
private void appendSourceAndInferenceMetadata(
XContentBuilder builder,
BytesReference source,
XContentType xContentType,
Map<String, Object> inferenceFieldsMap
) throws IOException {
builder.startObject();

// append the original source
try (
XContentParser parser = XContentHelper.createParserNotCompressed(XContentParserConfiguration.EMPTY, source, xContentType)
) {
// skip start object
parser.nextToken();
while (parser.nextToken() != XContentParser.Token.END_OBJECT) {
builder.copyCurrentStructure(parser);
}
}
long modifiedSourceSize = indexRequest.source().ramBytesUsed();

// Add the indexing pressure from the source modifications.
// add the inference metadata field
builder.field(InferenceMetadataFieldsMapper.NAME);
try (XContentParser parser = XContentHelper.mapToXContentParser(XContentParserConfiguration.EMPTY, inferenceFieldsMap)) {
builder.copyCurrentStructure(parser);
}

builder.endObject();
}
Contributor: Would the operation performed in this method temporarily generate a second copy of the source with _inference_fields added? I think it would, but asking to confirm my understanding.


private boolean incrementIndexingPressure(BulkItemRequest item, IndexRequest indexRequest, int inc) {
try {
coordinatingIndexingPressure.increment(1, modifiedSourceSize - originalSource.ramBytesUsed());
if (inc > 0) {
coordinatingIndexingPressure.increment(1, inc);
}
return true;
} catch (EsRejectedExecutionException e) {
indexRequest.source(originalSource, indexRequest.getContentType());
inferenceStats.bulkRejection().incrementBy(1);
item.abort(
item.index(),
@@ -702,40 +815,11 @@ private void applyInferenceResponses(BulkItemRequest item, FieldInferenceRespons
e
)
);
return false;
}
}
}

/**
* Appends the original source and the new inference metadata field directly to the provided
* {@link XContentBuilder}, avoiding the need to materialize the original source as a {@link Map}.
*/
private static void appendSourceAndInferenceMetadata(
XContentBuilder builder,
BytesReference source,
XContentType xContentType,
Map<String, Object> inferenceFieldsMap
) throws IOException {
builder.startObject();

// append the original source
try (XContentParser parser = XContentHelper.createParserNotCompressed(XContentParserConfiguration.EMPTY, source, xContentType)) {
// skip start object
parser.nextToken();
while (parser.nextToken() != XContentParser.Token.END_OBJECT) {
builder.copyCurrentStructure(parser);
}
}

// add the inference metadata field
builder.field(InferenceMetadataFieldsMapper.NAME);
try (XContentParser parser = XContentHelper.mapToXContentParser(XContentParserConfiguration.EMPTY, inferenceFieldsMap)) {
builder.copyCurrentStructure(parser);
}

builder.endObject();
}

static IndexRequest getIndexRequestOrNull(DocWriteRequest<?> docWriteRequest) {
if (docWriteRequest instanceof IndexRequest indexRequest) {
return indexRequest;
Contributor: Looks like there was an update to this test suite in #129140 that disabled these tests on the new semantic text format. We should probably fix that...

Contributor Author: Oops, good catch, thanks. I added the test back in c85718d

Contributor: We should apply just this change to 9.1 and 8.19 so that we restore test coverage in those branches

@@ -131,9 +131,7 @@ public ShardBulkInferenceActionFilterTests(boolean useLegacyFormat) {

@ParametersFactory
public static Iterable<Object[]> parameters() throws Exception {
List<Object[]> lst = new ArrayList<>();
lst.add(new Object[] { true });
return lst;
return List.of(new Boolean[] { true }, new Boolean[] { false });
}

@Before
@@ -616,10 +614,7 @@ public void testIndexingPressure() throws Exception {

IndexingPressure.Coordinating coordinatingIndexingPressure = indexingPressure.getCoordinating();
assertThat(coordinatingIndexingPressure, notNullValue());
verify(coordinatingIndexingPressure, times(6)).increment(eq(1), longThat(l -> l > 0));
if (useLegacyFormat == false) {
verify(coordinatingIndexingPressure).increment(1, longThat(l -> l > bytesUsed(doc1UpdateSource)));
}
verify(coordinatingIndexingPressure, times(useLegacyFormat ? 6 : 7)).increment(eq(1), longThat(l -> l > 0));

// Verify that the only times that increment is called are the times verified above
verify(coordinatingIndexingPressure, times(useLegacyFormat ? 6 : 7)).increment(anyInt(), anyLong());
@@ -668,87 +663,6 @@ public void testIndexingPressure() throws Exception {
verify(coordinatingIndexingPressure).close();
}

@SuppressWarnings("unchecked")
public void testIndexingPressureTripsOnInferenceRequestGeneration() throws Exception {
final InferenceStats inferenceStats = new InferenceStats(mock(), mock(), mock());
final InstrumentedIndexingPressure indexingPressure = new InstrumentedIndexingPressure(
Settings.builder().put(MAX_COORDINATING_BYTES.getKey(), "1b").build()
);
final StaticModel sparseModel = StaticModel.createRandomInstance(TaskType.SPARSE_EMBEDDING);
final ShardBulkInferenceActionFilter filter = createFilter(
threadPool,
Map.of(sparseModel.getInferenceEntityId(), sparseModel),
indexingPressure,
useLegacyFormat,
true,
inferenceStats
);

XContentBuilder doc1Source = IndexRequest.getXContentBuilder(XContentType.JSON, "sparse_field", "bar");

CountDownLatch chainExecuted = new CountDownLatch(1);
ActionFilterChain<BulkShardRequest, BulkShardResponse> actionFilterChain = (task, action, request, listener) -> {
try {
assertNull(request.getInferenceFieldMap());
assertThat(request.items().length, equalTo(3));

assertNull(request.items()[0].getPrimaryResponse());
assertNull(request.items()[2].getPrimaryResponse());

BulkItemRequest doc1Request = request.items()[1];
BulkItemResponse doc1Response = doc1Request.getPrimaryResponse();
assertNotNull(doc1Response);
assertTrue(doc1Response.isFailed());
BulkItemResponse.Failure doc1Failure = doc1Response.getFailure();
assertThat(
doc1Failure.getCause().getMessage(),
containsString("Unable to insert inference results into document [doc_1] due to memory pressure.")
);
assertThat(doc1Failure.getCause().getCause(), instanceOf(EsRejectedExecutionException.class));
assertThat(doc1Failure.getStatus(), is(RestStatus.TOO_MANY_REQUESTS));
verify(inferenceStats.bulkRejection()).incrementBy(1);

IndexRequest doc1IndexRequest = getIndexRequestOrNull(doc1Request.request());
assertThat(doc1IndexRequest, notNullValue());
assertThat(doc1IndexRequest.source(), equalTo(BytesReference.bytes(doc1Source)));

IndexingPressure.Coordinating coordinatingIndexingPressure = indexingPressure.getCoordinating();
assertThat(coordinatingIndexingPressure, notNullValue());
verify(coordinatingIndexingPressure).increment(eq(1), longThat(l -> l > bytesUsed(doc1Source)));
verify(coordinatingIndexingPressure, times(1)).increment(anyInt(), anyLong());

// Verify that the coordinating indexing pressure is maintained through downstream action filters
verify(coordinatingIndexingPressure, never()).close();

// Call the listener once the request is successfully processed, like is done in the production code path
listener.onResponse(null);
} finally {
chainExecuted.countDown();
}
};
ActionListener<BulkShardResponse> actionListener = (ActionListener<BulkShardResponse>) mock(ActionListener.class);
Task task = mock(Task.class);

Map<String, InferenceFieldMetadata> inferenceFieldMap = Map.of(
"sparse_field",
new InferenceFieldMetadata("sparse_field", sparseModel.getInferenceEntityId(), new String[] { "sparse_field" }, null)
);

BulkItemRequest[] items = new BulkItemRequest[3];
items[0] = new BulkItemRequest(0, new IndexRequest("index").id("doc_0").source("non_inference_field", "foo"));
items[1] = new BulkItemRequest(1, new IndexRequest("index").id("doc_1").source(doc1Source));
items[2] = new BulkItemRequest(2, new IndexRequest("index").id("doc_2").source("non_inference_field", "baz"));

BulkShardRequest request = new BulkShardRequest(new ShardId("test", "test", 0), WriteRequest.RefreshPolicy.NONE, items);
request.setInferenceFieldMap(inferenceFieldMap);
filter.apply(task, TransportShardBulkAction.ACTION_NAME, request, actionListener, actionFilterChain);
awaitLatch(chainExecuted, 10, TimeUnit.SECONDS);

IndexingPressure.Coordinating coordinatingIndexingPressure = indexingPressure.getCoordinating();
assertThat(coordinatingIndexingPressure, notNullValue());
verify(coordinatingIndexingPressure).close();
}

@SuppressWarnings("unchecked")
public void testIndexingPressureTripsOnInferenceResponseHandling() throws Exception {
final XContentBuilder doc1Source = IndexRequest.getXContentBuilder(XContentType.JSON, "sparse_field", "bar");