
Commit 8b47105

Optimize memory usage in ShardBulkInferenceActionFilter
This refactor improves memory efficiency by processing inference requests in batches, capped by a maximum input length. Changes include:

- A new dynamic operator setting to control the maximum batch size in bytes.
- Dropping input data from inference responses when the legacy semantic text format isn’t used, saving memory.
- Clearing inference results after each bulk item to free up memory sooner.

This is a step toward enabling circuit breakers to better handle memory usage when dealing with large inputs.
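The byte-capped batching described above could look roughly like the following. This is a minimal sketch of the idea only, not the filter's actual implementation; the class and method names are assumptions.

    import java.nio.charset.StandardCharsets;
    import java.util.ArrayList;
    import java.util.List;

    // Minimal sketch of byte-capped batching; not the actual filter code.
    final class ByteCappedBatcher {
        // Splits inputs into batches whose cumulative UTF-8 size stays under maxBatchSizeInBytes.
        // A batch always holds at least one input, so an oversized single input still goes through.
        static List<List<String>> batch(List<String> inputs, long maxBatchSizeInBytes) {
            List<List<String>> batches = new ArrayList<>();
            List<String> current = new ArrayList<>();
            long currentBytes = 0;
            for (String input : inputs) {
                long size = input.getBytes(StandardCharsets.UTF_8).length;
                if (current.isEmpty() == false && currentBytes + size > maxBatchSizeInBytes) {
                    batches.add(current);
                    current = new ArrayList<>();
                    currentBytes = 0;
                }
                current.add(input);
                currentBytes += size;
            }
            if (current.isEmpty() == false) {
                batches.add(current);
            }
            return batches;
        }
    }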
1 parent 1b2f565 commit 8b47105

File tree

6 files changed: +298 −235 lines changed

x-pack/plugin/inference/src/internalClusterTest/java/org/elasticsearch/xpack/inference/action/filter/ShardBulkInferenceActionFilterIT.java

Lines changed: 8 additions & 1 deletion
@@ -20,6 +20,7 @@
 import org.elasticsearch.action.update.UpdateRequestBuilder;
 import org.elasticsearch.cluster.metadata.IndexMetadata;
 import org.elasticsearch.common.settings.Settings;
+import org.elasticsearch.common.unit.ByteSizeValue;
 import org.elasticsearch.index.IndexSettings;
 import org.elasticsearch.index.mapper.InferenceMetadataFieldsMapper;
 import org.elasticsearch.index.mapper.SourceFieldMapper;
@@ -44,6 +45,7 @@
 import java.util.Map;
 import java.util.Set;
 
+import static org.elasticsearch.xpack.inference.action.filter.ShardBulkInferenceActionFilter.INDICES_INFERENCE_BATCH_SIZE;
 import static org.elasticsearch.xpack.inference.mapper.SemanticTextFieldTests.randomSemanticTextInput;
 import static org.hamcrest.Matchers.containsString;
 import static org.hamcrest.Matchers.equalTo;
@@ -84,7 +86,12 @@ public void setup() throws Exception {
 
     @Override
     protected Settings nodeSettings(int nodeOrdinal, Settings otherSettings) {
-        return Settings.builder().put(LicenseSettings.SELF_GENERATED_LICENSE_TYPE.getKey(), "trial").build();
+        long batchSizeInBytes = randomLongBetween(0, ByteSizeValue.ofKb(1).getBytes());
+        return Settings.builder()
+            .put(otherSettings)
+            .put(LicenseSettings.SELF_GENERATED_LICENSE_TYPE.getKey(), "trial")
+            .put(INDICES_INFERENCE_BATCH_SIZE.getKey(), ByteSizeValue.ofBytes(batchSizeInBytes))
+            .build();
     }
 
     @Override
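The test now randomizes the batch size between 0 bytes and 1 KiB, presumably to exercise a range of batch boundaries. It imports INDICES_INFERENCE_BATCH_SIZE from ShardBulkInferenceActionFilter, whose declaration is not part of this excerpt; a sketch of what such a byte-size operator setting might look like follows, with the key string and default value assumed rather than taken from this commit.

    import org.elasticsearch.common.settings.Setting;
    import org.elasticsearch.common.unit.ByteSizeValue;

    // Hypothetical sketch of the setting's declaration inside ShardBulkInferenceActionFilter;
    // the key string and default value are assumed for illustration only.
    public class ShardBulkInferenceActionFilterSketch {
        public static final Setting<ByteSizeValue> INDICES_INFERENCE_BATCH_SIZE = Setting.byteSizeSetting(
            "indices.inference.batch_size",   // assumed key
            ByteSizeValue.ofMb(1),            // assumed default per-batch cap
            Setting.Property.NodeScope,
            Setting.Property.OperatorDynamic  // dynamic, but restricted to operator users
        );
    }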

x-pack/plugin/inference/src/main/java/org/elasticsearch/xpack/inference/InferencePlugin.java

Lines changed: 2 additions & 0 deletions
@@ -139,6 +139,7 @@
 import java.util.function.Supplier;
 
 import static java.util.Collections.singletonList;
+import static org.elasticsearch.xpack.inference.action.filter.ShardBulkInferenceActionFilter.INDICES_INFERENCE_BATCH_SIZE;
 import static org.elasticsearch.xpack.inference.common.InferenceAPIClusterAwareRateLimitingFeature.INFERENCE_API_CLUSTER_AWARE_RATE_LIMITING_FEATURE_FLAG;
 
 public class InferencePlugin extends Plugin
@@ -436,6 +437,7 @@ public List<Setting<?>> getSettings() {
         settings.addAll(Truncator.getSettingsDefinitions());
         settings.addAll(RequestExecutorServiceSettings.getSettingsDefinitions());
         settings.add(SKIP_VALIDATE_AND_START);
+        settings.add(INDICES_INFERENCE_BATCH_SIZE);
         settings.addAll(ElasticInferenceServiceSettings.getSettingsDefinitions());
 
         return settings;
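Adding the setting to getSettings() registers it with the node. For runtime updates to take effect, a dynamic setting is typically also wired to a settings-update consumer via ClusterSettings. A minimal sketch of that wiring, with the class shape and field name assumed rather than taken from this commit:

    import org.elasticsearch.cluster.service.ClusterService;
    import org.elasticsearch.common.settings.Settings;

    import static org.elasticsearch.xpack.inference.action.filter.ShardBulkInferenceActionFilter.INDICES_INFERENCE_BATCH_SIZE;

    // Hypothetical wiring; the class shape and field name are assumed, not part of this diff.
    public class BatchSizeHolder {
        private volatile long batchSizeInBytes;

        public BatchSizeHolder(ClusterService clusterService, Settings settings) {
            this.batchSizeInBytes = INDICES_INFERENCE_BATCH_SIZE.get(settings).getBytes();
            // Keep the cap in sync when an operator updates the cluster setting.
            clusterService.getClusterSettings()
                .addSettingsUpdateConsumer(INDICES_INFERENCE_BATCH_SIZE, value -> this.batchSizeInBytes = value.getBytes());
        }

        long batchSizeInBytes() {
            return batchSizeInBytes;
        }
    }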
