Skip to content
Merged
Show file tree
Hide file tree
Changes from 6 commits
Commits
Show all changes
17 commits
Select commit Hold shift + click to select a range
006bdf2
Starting refactor
jonathan-buttner Aug 19, 2025
2a7ff64
Merge branch 'main' of github.com:elastic/elasticsearch into ml-remov…
jonathan-buttner Aug 28, 2025
fca2543
Not sending enabled field across nodes
jonathan-buttner Aug 28, 2025
80d4224
Adding transport version change
jonathan-buttner Aug 29, 2025
2ca11e2
Merge branch 'main' of github.com:elastic/elasticsearch into ml-remov…
jonathan-buttner Aug 29, 2025
d8b841f
Removing minimum settings changes
jonathan-buttner Aug 29, 2025
97a3730
Addressing feedback
jonathan-buttner Sep 2, 2025
cee82cf
Merge branch 'main' of github.com:elastic/elasticsearch into ml-remov…
jonathan-buttner Sep 2, 2025
dc7f53b
Rejecting rate limit field
jonathan-buttner Sep 2, 2025
df0a224
Ensure parsing from index does not throw
jonathan-buttner Sep 2, 2025
314b4ba
Merge branch 'main' of github.com:elastic/elasticsearch into ml-remov…
jonathan-buttner Sep 2, 2025
910d317
Adding test to throw when rate limit is in request
jonathan-buttner Sep 2, 2025
6fd0c6d
Merge branch 'main' into ml-remove-eis-rl
jonathan-buttner Sep 2, 2025
48e88b3
Returning validation exception for rate limit field
jonathan-buttner Sep 3, 2025
a1d75fe
Merge branch 'main' of github.com:elastic/elasticsearch into ml-remov…
jonathan-buttner Sep 3, 2025
04d5699
Merge branch 'ml-remove-eis-rl' of github.com:jonathan-buttner/elasti…
jonathan-buttner Sep 3, 2025
dfd9154
Merge branch 'main' of github.com:elastic/elasticsearch into ml-remov…
jonathan-buttner Sep 3, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -358,6 +358,7 @@ static TransportVersion def(int id) {
public static final TransportVersion ALLOCATION_DECISION_NOT_PREFERRED = def(9_145_0_00);
public static final TransportVersion ESQL_QUALIFIERS_IN_ATTRIBUTES = def(9_146_0_00);
public static final TransportVersion PROJECT_RESERVED_STATE_MOVE_TO_REGISTRY = def(9_147_0_00);
public static final TransportVersion INFERENCE_API_DISABLE_EIS_RATE_LIMITING = def(9_148_0_00);

/*
* STOP! READ THIS FIRST! No, really,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -449,9 +449,11 @@ public synchronized TimeValue executeEnqueuedTask() {
}

private TimeValue executeEnqueuedTaskInternal() {
var timeBeforeAvailableToken = rateLimiter.timeToReserve(1);
if (shouldExecuteImmediately(timeBeforeAvailableToken) == false) {
return timeBeforeAvailableToken;
if (rateLimitSettings.isEnabled()) {
var timeBeforeAvailableToken = rateLimiter.timeToReserve(1);
if (shouldExecuteImmediately(timeBeforeAvailableToken) == false) {
return timeBeforeAvailableToken;
}
}

var task = queue.poll();
Expand All @@ -463,9 +465,11 @@ private TimeValue executeEnqueuedTaskInternal() {
return NO_TASKS_AVAILABLE;
}

// We should never have to wait because we checked above
var reserveRes = rateLimiter.reserve(1);
assert shouldExecuteImmediately(reserveRes) : "Reserving request tokens required a sleep when it should not have";
if (rateLimitSettings.isEnabled()) {
// We should never have to wait because we checked above
var reserveRes = rateLimiter.reserve(1);
assert shouldExecuteImmediately(reserveRes) : "Reserving request tokens required a sleep when it should not have";
}

task.getRequestManager()
.execute(task.getInferenceInputs(), requestSender, task.getRequestCompletedFunction(), task.getListener());
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -193,7 +193,7 @@ private static Map<String, DefaultModelConfig> initDefaultEndpoints(
DEFAULT_CHAT_COMPLETION_ENDPOINT_ID_V1,
TaskType.CHAT_COMPLETION,
NAME,
new ElasticInferenceServiceCompletionServiceSettings(DEFAULT_CHAT_COMPLETION_MODEL_ID_V1, null),
new ElasticInferenceServiceCompletionServiceSettings(DEFAULT_CHAT_COMPLETION_MODEL_ID_V1),
EmptyTaskSettings.INSTANCE,
EmptySecretSettings.INSTANCE,
elasticInferenceServiceComponents
Expand All @@ -206,7 +206,7 @@ private static Map<String, DefaultModelConfig> initDefaultEndpoints(
DEFAULT_ELSER_ENDPOINT_ID_V2,
TaskType.SPARSE_EMBEDDING,
NAME,
new ElasticInferenceServiceSparseEmbeddingsServiceSettings(DEFAULT_ELSER_2_MODEL_ID, null, null),
new ElasticInferenceServiceSparseEmbeddingsServiceSettings(DEFAULT_ELSER_2_MODEL_ID, null),
EmptyTaskSettings.INSTANCE,
EmptySecretSettings.INSTANCE,
elasticInferenceServiceComponents,
Expand All @@ -224,8 +224,7 @@ private static Map<String, DefaultModelConfig> initDefaultEndpoints(
DEFAULT_MULTILINGUAL_EMBED_MODEL_ID,
defaultDenseTextEmbeddingsSimilarity(),
null,
null,
ElasticInferenceServiceDenseTextEmbeddingsServiceSettings.DEFAULT_RATE_LIMIT_SETTINGS
null
),
EmptyTaskSettings.INSTANCE,
EmptySecretSettings.INSTANCE,
Expand All @@ -245,7 +244,7 @@ private static Map<String, DefaultModelConfig> initDefaultEndpoints(
DEFAULT_RERANK_ENDPOINT_ID_V1,
TaskType.RERANK,
NAME,
new ElasticInferenceServiceRerankServiceSettings(DEFAULT_RERANK_MODEL_ID_V1, null),
new ElasticInferenceServiceRerankServiceSettings(DEFAULT_RERANK_MODEL_ID_V1),
EmptyTaskSettings.INSTANCE,
EmptySecretSettings.INSTANCE,
elasticInferenceServiceComponents
Expand Down Expand Up @@ -622,8 +621,7 @@ public Model updateModelWithEmbeddingDetails(Model model, int embeddingSize) {
modelId,
similarityToUse,
embeddingSize,
maxInputTokens,
serviceSettings.rateLimitSettings()
maxInputTokens
);

return new ElasticInferenceServiceDenseTextEmbeddingsModel(embeddingsModel, updateServiceSettings);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -35,8 +35,7 @@ public static ElasticInferenceServiceCompletionModel of(
) {
var originalModelServiceSettings = model.getServiceSettings();
var overriddenServiceSettings = new ElasticInferenceServiceCompletionServiceSettings(
Objects.requireNonNullElse(request.model(), originalModelServiceSettings.modelId()),
originalModelServiceSettings.rateLimitSettings()
Objects.requireNonNullElse(request.model(), originalModelServiceSettings.modelId())
);

return new ElasticInferenceServiceCompletionModel(model, overriddenServiceSettings);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,6 @@
import org.elasticsearch.inference.ServiceSettings;
import org.elasticsearch.xcontent.XContentBuilder;
import org.elasticsearch.xpack.inference.services.ConfigurationParseContext;
import org.elasticsearch.xpack.inference.services.elastic.ElasticInferenceService;
import org.elasticsearch.xpack.inference.services.elastic.ElasticInferenceServiceRateLimitServiceSettings;
import org.elasticsearch.xpack.inference.services.settings.FilteredXContentObject;
import org.elasticsearch.xpack.inference.services.settings.RateLimitSettings;
Expand All @@ -35,38 +34,33 @@ public class ElasticInferenceServiceCompletionServiceSettings extends FilteredXC

public static final String NAME = "elastic_inference_service_completion_service_settings";

private static final RateLimitSettings DEFAULT_RATE_LIMIT_SETTINGS = new RateLimitSettings(720L);

public static ElasticInferenceServiceCompletionServiceSettings fromMap(Map<String, Object> map, ConfigurationParseContext context) {
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

With these changes, the context argument is no longer used, so it can be removed. This also applies to the other *ServiceSettings classes.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yeah, great point. I'm going to remove the context argument from the *Settings classes but leave it in the models, just in case we need the context for a settings class in the future. That way we don't have to plumb it through again.

ValidationException validationException = new ValidationException();

String modelId = extractRequiredString(map, MODEL_ID, ModelConfigurations.SERVICE_SETTINGS, validationException);
RateLimitSettings rateLimitSettings = RateLimitSettings.of(
map,
DEFAULT_RATE_LIMIT_SETTINGS,
validationException,
ElasticInferenceService.NAME,
context
);
RateLimitSettings.disabledRateLimiting(map);

if (validationException.validationErrors().isEmpty() == false) {
throw validationException;
}

return new ElasticInferenceServiceCompletionServiceSettings(modelId, rateLimitSettings);
return new ElasticInferenceServiceCompletionServiceSettings(modelId);
}

private final String modelId;
private final RateLimitSettings rateLimitSettings;

public ElasticInferenceServiceCompletionServiceSettings(String modelId, RateLimitSettings rateLimitSettings) {
public ElasticInferenceServiceCompletionServiceSettings(String modelId) {
this.modelId = Objects.requireNonNull(modelId);
this.rateLimitSettings = Objects.requireNonNullElse(rateLimitSettings, DEFAULT_RATE_LIMIT_SETTINGS);
this.rateLimitSettings = RateLimitSettings.DISABLED_INSTANCE;
}

public ElasticInferenceServiceCompletionServiceSettings(StreamInput in) throws IOException {
this.modelId = in.readString();
this.rateLimitSettings = new RateLimitSettings(in);
this.rateLimitSettings = RateLimitSettings.DISABLED_INSTANCE;
if (in.getTransportVersion().before(TransportVersions.INFERENCE_API_DISABLE_EIS_RATE_LIMITING)) {
new RateLimitSettings(in);
}
}

@Override
Expand Down Expand Up @@ -110,7 +104,9 @@ protected XContentBuilder toXContentFragmentOfExposedFields(XContentBuilder buil
@Override
public void writeTo(StreamOutput out) throws IOException {
out.writeString(modelId);
rateLimitSettings.writeTo(out);
if (out.getTransportVersion().before(TransportVersions.INFERENCE_API_DISABLE_EIS_RATE_LIMITING)) {
rateLimitSettings.writeTo(out);
}
}

@Override
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,6 @@
import org.elasticsearch.inference.SimilarityMeasure;
import org.elasticsearch.xcontent.XContentBuilder;
import org.elasticsearch.xpack.inference.services.ConfigurationParseContext;
import org.elasticsearch.xpack.inference.services.elastic.ElasticInferenceService;
import org.elasticsearch.xpack.inference.services.elastic.ElasticInferenceServiceRateLimitServiceSettings;
import org.elasticsearch.xpack.inference.services.settings.FilteredXContentObject;
import org.elasticsearch.xpack.inference.services.settings.RateLimitSettings;
Expand All @@ -43,8 +42,6 @@ public class ElasticInferenceServiceDenseTextEmbeddingsServiceSettings extends F

public static final String NAME = "elastic_inference_service_dense_embeddings_service_settings";

public static final RateLimitSettings DEFAULT_RATE_LIMIT_SETTINGS = new RateLimitSettings(10_000);

private final String modelId;
private final SimilarityMeasure similarity;
private final Integer dimensions;
Expand All @@ -68,13 +65,7 @@ private static ElasticInferenceServiceDenseTextEmbeddingsServiceSettings fromReq
ValidationException validationException = new ValidationException();

String modelId = extractRequiredString(map, MODEL_ID, ModelConfigurations.SERVICE_SETTINGS, validationException);
RateLimitSettings rateLimitSettings = RateLimitSettings.of(
map,
DEFAULT_RATE_LIMIT_SETTINGS,
validationException,
ElasticInferenceService.NAME,
context
);
RateLimitSettings.disabledRateLimiting(map);

SimilarityMeasure similarity = extractSimilarity(map, ModelConfigurations.SERVICE_SETTINGS, validationException);
Integer dims = removeAsType(map, DIMENSIONS, Integer.class);
Expand All @@ -84,7 +75,7 @@ private static ElasticInferenceServiceDenseTextEmbeddingsServiceSettings fromReq
throw validationException;
}

return new ElasticInferenceServiceDenseTextEmbeddingsServiceSettings(modelId, similarity, dims, maxInputTokens, rateLimitSettings);
return new ElasticInferenceServiceDenseTextEmbeddingsServiceSettings(modelId, similarity, dims, maxInputTokens);
}

private static ElasticInferenceServiceDenseTextEmbeddingsServiceSettings fromPersistentMap(
Expand All @@ -94,13 +85,7 @@ private static ElasticInferenceServiceDenseTextEmbeddingsServiceSettings fromPer
ValidationException validationException = new ValidationException();

String modelId = extractRequiredString(map, MODEL_ID, ModelConfigurations.SERVICE_SETTINGS, validationException);
RateLimitSettings rateLimitSettings = RateLimitSettings.of(
map,
DEFAULT_RATE_LIMIT_SETTINGS,
validationException,
ElasticInferenceService.NAME,
context
);
RateLimitSettings.disabledRateLimiting(map);

SimilarityMeasure similarity = extractSimilarity(map, ModelConfigurations.SERVICE_SETTINGS, validationException);
Integer dims = removeAsType(map, DIMENSIONS, Integer.class);
Expand All @@ -110,29 +95,32 @@ private static ElasticInferenceServiceDenseTextEmbeddingsServiceSettings fromPer
throw validationException;
}

return new ElasticInferenceServiceDenseTextEmbeddingsServiceSettings(modelId, similarity, dims, maxInputTokens, rateLimitSettings);
return new ElasticInferenceServiceDenseTextEmbeddingsServiceSettings(modelId, similarity, dims, maxInputTokens);
}

public ElasticInferenceServiceDenseTextEmbeddingsServiceSettings(
String modelId,
@Nullable SimilarityMeasure similarity,
@Nullable Integer dimensions,
@Nullable Integer maxInputTokens,
RateLimitSettings rateLimitSettings
@Nullable Integer maxInputTokens
) {
this.modelId = modelId;
this.similarity = similarity;
this.dimensions = dimensions;
this.maxInputTokens = maxInputTokens;
this.rateLimitSettings = Objects.requireNonNullElse(rateLimitSettings, DEFAULT_RATE_LIMIT_SETTINGS);
this.rateLimitSettings = RateLimitSettings.DISABLED_INSTANCE;
}

public ElasticInferenceServiceDenseTextEmbeddingsServiceSettings(StreamInput in) throws IOException {
this.modelId = in.readString();
this.similarity = in.readOptionalEnum(SimilarityMeasure.class);
this.dimensions = in.readOptionalVInt();
this.maxInputTokens = in.readOptionalVInt();
this.rateLimitSettings = new RateLimitSettings(in);
this.rateLimitSettings = RateLimitSettings.DISABLED_INSTANCE;

if (in.getTransportVersion().before(TransportVersions.INFERENCE_API_DISABLE_EIS_RATE_LIMITING)) {
new RateLimitSettings(in);
}
}

@Override
Expand Down Expand Up @@ -221,7 +209,9 @@ public void writeTo(StreamOutput out) throws IOException {
out.writeOptionalEnum(SimilarityMeasure.translateSimilarity(similarity, out.getTransportVersion()));
out.writeOptionalVInt(dimensions);
out.writeOptionalVInt(maxInputTokens);
rateLimitSettings.writeTo(out);
if (out.getTransportVersion().before(TransportVersions.INFERENCE_API_DISABLE_EIS_RATE_LIMITING)) {
rateLimitSettings.writeTo(out);
}
}

@Override
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,6 @@
import org.elasticsearch.inference.ServiceSettings;
import org.elasticsearch.xcontent.XContentBuilder;
import org.elasticsearch.xpack.inference.services.ConfigurationParseContext;
import org.elasticsearch.xpack.inference.services.elastic.ElasticInferenceService;
import org.elasticsearch.xpack.inference.services.elastic.ElasticInferenceServiceRateLimitServiceSettings;
import org.elasticsearch.xpack.inference.services.settings.FilteredXContentObject;
import org.elasticsearch.xpack.inference.services.settings.RateLimitSettings;
Expand All @@ -35,35 +34,30 @@ public class ElasticInferenceServiceRerankServiceSettings extends FilteredXConte

public static final String NAME = "elastic_rerank_service_settings";

private static final RateLimitSettings DEFAULT_RATE_LIMIT_SETTINGS = new RateLimitSettings(500);

public static ElasticInferenceServiceRerankServiceSettings fromMap(Map<String, Object> map, ConfigurationParseContext context) {
ValidationException validationException = new ValidationException();

String modelId = extractRequiredString(map, MODEL_ID, ModelConfigurations.SERVICE_SETTINGS, validationException);
RateLimitSettings rateLimitSettings = RateLimitSettings.of(
map,
DEFAULT_RATE_LIMIT_SETTINGS,
validationException,
ElasticInferenceService.NAME,
context
);

return new ElasticInferenceServiceRerankServiceSettings(modelId, rateLimitSettings);
RateLimitSettings.disabledRateLimiting(map);

return new ElasticInferenceServiceRerankServiceSettings(modelId);
}

private final String modelId;

private final RateLimitSettings rateLimitSettings;

public ElasticInferenceServiceRerankServiceSettings(String modelId, RateLimitSettings rateLimitSettings) {
public ElasticInferenceServiceRerankServiceSettings(String modelId) {
this.modelId = Objects.requireNonNull(modelId);
this.rateLimitSettings = Objects.requireNonNullElse(rateLimitSettings, DEFAULT_RATE_LIMIT_SETTINGS);
this.rateLimitSettings = RateLimitSettings.DISABLED_INSTANCE;
}

public ElasticInferenceServiceRerankServiceSettings(StreamInput in) throws IOException {
this.modelId = in.readString();
this.rateLimitSettings = new RateLimitSettings(in);
this.rateLimitSettings = RateLimitSettings.DISABLED_INSTANCE;
if (in.getTransportVersion().before(TransportVersions.INFERENCE_API_DISABLE_EIS_RATE_LIMITING)) {
new RateLimitSettings(in);
}
}

@Override
Expand Down Expand Up @@ -115,7 +109,9 @@ public XContentBuilder toXContent(XContentBuilder builder, Params params) throws
@Override
public void writeTo(StreamOutput out) throws IOException {
out.writeString(modelId);
rateLimitSettings.writeTo(out);
if (out.getTransportVersion().before(TransportVersions.INFERENCE_API_DISABLE_EIS_RATE_LIMITING)) {
rateLimitSettings.writeTo(out);
}
}

@Override
Expand Down
Loading