Skip to content
Open
Show file tree
Hide file tree
Changes from 27 commits
Commits
Show all changes
29 commits
Select commit Hold shift + click to select a range
f1e7419
Remove worst-case additional 50ms latency for non-rate limited requests
timgrein Oct 8, 2025
a326e31
Update docs/changelog/136167.yaml
timgrein Oct 8, 2025
191029f
Merge branch 'main' into es-eis-latency-issue
timgrein Oct 8, 2025
edaa0f0
Do not use forbidden API
timgrein Oct 8, 2025
57c5605
Merge remote-tracking branch 'origin/es-eis-latency-issue' into es-ei…
timgrein Oct 8, 2025
f98b82e
Merge branch 'main' into es-eis-latency-issue
timgrein Oct 8, 2025
f17ec00
Move startRequestQueueTask before start signal
timgrein Oct 8, 2025
90ee1a1
Merge remote-tracking branch 'origin/es-eis-latency-issue' into es-ei…
timgrein Oct 8, 2025
a9e7610
Cleanup in finally block
timgrein Oct 8, 2025
ec513be
Reject request on shutdown
timgrein Oct 8, 2025
174526c
Reuse rateLimitSettingsEnabled check
timgrein Oct 8, 2025
f506cb3
Add NoopTask to wake up queue on shutdown
timgrein Oct 9, 2025
ae349fd
Only add non-rate-limited tasks to fast-path request queue
timgrein Oct 9, 2025
540f49d
Extract rejection logic to common static method
timgrein Oct 9, 2025
0dca88a
Remove unnecessary cast
timgrein Oct 10, 2025
0590561
Use string placeholder in assertion
timgrein Oct 10, 2025
2e65475
Adjust test to check that a throwing task does not terminate the service
timgrein Oct 10, 2025
4fb2372
Adjust error message in general exception handler
timgrein Oct 10, 2025
2930151
Adjust warn to error
timgrein Oct 10, 2025
b2fd85f
Adjust error message when request gets rejected
timgrein Oct 13, 2025
91f387a
Rename id in RateLimitingEndpointHandler to rateLimitGroupingId
timgrein Oct 13, 2025
90e672b
Use Strings.format(...) in assertion
timgrein Oct 13, 2025
1cf24dc
Use thenAnswer instead of suppression
timgrein Oct 13, 2025
a92a7c0
Only reject requests of the respective execution path (rate-limited v…
timgrein Oct 13, 2025
8e52c22
Merge branch 'main' into es-eis-latency-issue
timgrein Oct 14, 2025
a868152
Submit only ingest embeddings requests to rate-limited execution path
timgrein Oct 14, 2025
dd53fcb
Merge remote-tracking branch 'origin/es-eis-latency-issue' into es-ei…
timgrein Oct 14, 2025
c575eba
Add rate limiting check
timgrein Oct 14, 2025
69db0e1
Make NoopTask all caps
timgrein Oct 14, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 6 additions & 0 deletions docs/changelog/136167.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
pr: 136167
summary: "[Inference API] Remove worst-case additional 50ms latency for non-rate limited\
\ requests"
area: Machine Learning
type: bug
issues: []
Original file line number Diff line number Diff line change
Expand Up @@ -61,6 +61,10 @@ public static boolean isInternalTypeOrUnspecified(InputType inputType) {
return inputType == InputType.INTERNAL_INGEST || inputType == InputType.INTERNAL_SEARCH || inputType == InputType.UNSPECIFIED;
}

public static boolean isIngest(InputType inputType) {
return inputType == InputType.INGEST || inputType == InputType.INTERNAL_INGEST;
}

public static boolean isSpecified(InputType inputType) {
return inputType != null && inputType != InputType.UNSPECIFIED;
}
Expand Down

Large diffs are not rendered by default.

Original file line number Diff line number Diff line change
Expand Up @@ -125,7 +125,7 @@ public RateLimitSettings(long requestsPerTimeUnit, TimeUnit timeUnit) {
}

// This should only be used for testing.
RateLimitSettings(long requestsPerTimeUnit, TimeUnit timeUnit, boolean enabled) {
public RateLimitSettings(long requestsPerTimeUnit, TimeUnit timeUnit, boolean enabled) {
if (requestsPerTimeUnit <= 0) {
throw new IllegalArgumentException("requests per minute must be positive");
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -64,6 +64,10 @@ public static InputType randomWithInternalAndUnspecified() {
return randomFrom(InputType.INTERNAL_SEARCH, InputType.INTERNAL_INGEST, InputType.UNSPECIFIED);
}

public static InputType randomIngest() {
return randomFrom(InputType.INGEST, InputType.INTERNAL_INGEST);
}

public void testFromRestString_ValidInputType() {
for (String internal : List.of("search", "ingest", "classification", "clustering", "unspecified")) {
assertEquals(InputType.fromRestString(internal), InputType.fromString(internal));
Expand Down Expand Up @@ -211,4 +215,8 @@ public void testValidateInputTypeTranslationValues_ThrowsAnException_WhenValueIs
)
);
}

public void testIsIngest() {
assertTrue(InputType.isIngest(randomFrom(InputType.INGEST, InputType.INTERNAL_INGEST)));
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -354,7 +354,12 @@ public void testHttpRequestSender_Throws_WhenCallingSendBeforeStart() throws Exc
PlainActionFuture<InferenceServiceResults> listener = new PlainActionFuture<>();
var thrownException = expectThrows(
AssertionError.class,
() -> sender.send(RequestManagerTests.createMock(), new EmbeddingsInput(List.of(), null), null, listener)
() -> sender.send(
RequestManagerTests.createMockWithRateLimitingEnabled(),
new EmbeddingsInput(List.of(), null),
null,
listener
)
);
assertThat(thrownException.getMessage(), is("call start() before sending a request"));
}
Expand All @@ -375,7 +380,12 @@ public void testHttpRequestSender_Throws_WhenATimeoutOccurs() throws Exception {
sender.startSynchronously();

PlainActionFuture<InferenceServiceResults> listener = new PlainActionFuture<>();
sender.send(RequestManagerTests.createMock(), new EmbeddingsInput(List.of(), null), TimeValue.timeValueNanos(1), listener);
sender.send(
RequestManagerTests.createMockWithRateLimitingEnabled(),
new EmbeddingsInput(List.of(), null),
TimeValue.timeValueNanos(1),
listener
);

var thrownException = expectThrows(ElasticsearchStatusException.class, () -> listener.actionGet(TIMEOUT));

Expand All @@ -397,7 +407,12 @@ public void testHttpRequestSenderWithTimeout_Throws_WhenATimeoutOccurs() throws
sender.startSynchronously();

PlainActionFuture<InferenceServiceResults> listener = new PlainActionFuture<>();
sender.send(RequestManagerTests.createMock(), new EmbeddingsInput(List.of(), null), TimeValue.timeValueNanos(1), listener);
sender.send(
RequestManagerTests.createMockWithRateLimitingEnabled(),
new EmbeddingsInput(List.of(), null),
TimeValue.timeValueNanos(1),
listener
);

var thrownException = expectThrows(ElasticsearchStatusException.class, () -> listener.actionGet(TIMEOUT));

Expand Down

Large diffs are not rendered by default.

Original file line number Diff line number Diff line change
Expand Up @@ -15,26 +15,36 @@
import org.elasticsearch.xpack.inference.external.request.RequestTests;
import org.elasticsearch.xpack.inference.services.settings.RateLimitSettings;

import java.util.concurrent.TimeUnit;

import static org.mockito.ArgumentMatchers.any;
import static org.mockito.Mockito.doAnswer;
import static org.mockito.Mockito.mock;
import static org.mockito.Mockito.when;

public class RequestManagerTests {
public static RequestManager createMock() {
return createMock(mock(RequestSender.class));
public static RequestManager createMockWithRateLimitingDisabled(RequestSender requestSender, String inferenceEntityId) {
return createMock(requestSender, inferenceEntityId, new RateLimitSettings(1, TimeUnit.MINUTES, false));
}

public static RequestManager createMockWithRateLimitingDisabled(String inferenceEntityId) {
return createMock(mock(RequestSender.class), inferenceEntityId, new RateLimitSettings(1, TimeUnit.MINUTES, false));
}

public static RequestManager createMockWithRateLimitingEnabled() {
return createMockWithRateLimitingEnabled(mock(RequestSender.class));
}

public static RequestManager createMock(String inferenceEntityId) {
return createMock(mock(RequestSender.class), inferenceEntityId);
public static RequestManager createMockWithRateLimitingEnabled(String inferenceEntityId) {
return createMockWithRateLimitingEnabled(mock(RequestSender.class), inferenceEntityId);
}

public static RequestManager createMock(RequestSender requestSender) {
return createMock(requestSender, "id", new RateLimitSettings(1));
public static RequestManager createMockWithRateLimitingEnabled(RequestSender requestSender) {
return createMock(requestSender, "id", new RateLimitSettings(1, TimeUnit.MINUTES, true));
}

public static RequestManager createMock(RequestSender requestSender, String inferenceEntityId) {
return createMock(requestSender, inferenceEntityId, new RateLimitSettings(1));
public static RequestManager createMockWithRateLimitingEnabled(RequestSender requestSender, String inferenceEntityId) {
return createMock(requestSender, inferenceEntityId, new RateLimitSettings(1, TimeUnit.MINUTES, true));
}

public static RequestManager createMock(RequestSender requestSender, String inferenceEntityId, RateLimitSettings settings) {
Expand Down