Merged

51 commits
85478cf
Refactor Hugging Face service settings and completion request methods…
Jan-Kazlouski-elastic Jun 23, 2025
727fd8e
Add Llama model support for embeddings and chat completions
Jan-Kazlouski-elastic Jun 26, 2025
cc14b18
Refactor Llama request classes to improve secret settings handling
Jan-Kazlouski-elastic Jun 26, 2025
ceef95a
Refactor DeltaParser in LlamaStreamingProcessor to improve argument h…
Jan-Kazlouski-elastic Jun 29, 2025
55d9014
Enhance Llama streaming processing by adding support for nullable obj…
Jan-Kazlouski-elastic Jul 1, 2025
a83b0b7
[CI] Auto commit changes from spotless
Jul 1, 2025
852aa19
Merge remote-tracking branch 'origin/main' into feature/llama-embedin…
Jan-Kazlouski-elastic Jul 1, 2025
d6b53c3
Fix error messages in LlamaActionCreator
Jan-Kazlouski-elastic Jul 1, 2025
6ce4b09
[CI] Auto commit changes from spotless
Jul 1, 2025
8e7ca13
Add detailed Javadoc comments to Llama classes for improved documenta…
Jan-Kazlouski-elastic Jul 1, 2025
604d441
Enhance LlamaChatCompletionResponseHandler to support mid-stream erro…
Jan-Kazlouski-elastic Jul 1, 2025
74fd6e8
Add Javadoc comments to Llama classes for improved documentation and …
Jan-Kazlouski-elastic Jul 1, 2025
ac161fa
Fix checkstyle
Jan-Kazlouski-elastic Jul 1, 2025
a13020c
Update LlamaEmbeddingsRequest to use mediaTypeWithoutParameters for c…
Jan-Kazlouski-elastic Jul 2, 2025
4eade05
Add unit tests for LlamaActionCreator and related models
Jan-Kazlouski-elastic Jul 2, 2025
39c5787
Add unit tests for LlamaChatCompletionServiceSettings to validate con…
Jan-Kazlouski-elastic Jul 2, 2025
6a135c5
Add unit tests for LlamaEmbeddingsServiceSettings to validate configu…
Jan-Kazlouski-elastic Jul 2, 2025
c6fc56f
Add unit tests for LlamaEmbeddingsServiceSettings to validate various…
Jan-Kazlouski-elastic Jul 2, 2025
e2dce7c
Add unit tests for LlamaChatCompletionResponseHandler to validate err…
Jan-Kazlouski-elastic Jul 3, 2025
41591ae
Refactor Llama embedding and chat completion tests for consistency an…
Jan-Kazlouski-elastic Jul 3, 2025
4d2a5dd
Add unit tests for LlamaChatCompletionRequestEntity to validate messa…
Jan-Kazlouski-elastic Jul 3, 2025
1573d53
Add unit tests for LlamaEmbeddingsRequest to validate request creatio…
Jan-Kazlouski-elastic Jul 3, 2025
da55903
Add unit tests for LlamaEmbeddingsRequestEntity to validate XContent …
Jan-Kazlouski-elastic Jul 3, 2025
8cc8958
Add unit tests for LlamaErrorResponse to validate error handling from…
Jan-Kazlouski-elastic Jul 3, 2025
9573a48
Add unit tests for LlamaChatCompletionServiceSettings to validate con…
Jan-Kazlouski-elastic Jul 4, 2025
36ff4cd
Merge remote-tracking branch 'origin/main' into feature/llama-embedin…
Jan-Kazlouski-elastic Jul 4, 2025
c193ecf
Add tests for LlamaService request configuration validation and error…
Jan-Kazlouski-elastic Jul 5, 2025
c3baecf
Merge remote-tracking branch 'origin/main' into feature/llama-embedin…
Jan-Kazlouski-elastic Jul 7, 2025
a7e342b
Fix error message formatting in LlamaServiceTests for better localiza…
Jan-Kazlouski-elastic Jul 7, 2025
15c14d7
Merge remote-tracking branch 'origin/main' into feature/llama-embedin…
Jan-Kazlouski-elastic Jul 10, 2025
75cbf85
Refactor Llama model classes to implement accept method for action vi…
Jan-Kazlouski-elastic Jul 10, 2025
e06653b
Hide Llama service from configuration API to enhance security and red…
Jan-Kazlouski-elastic Jul 10, 2025
fe6173e
Refactor Llama model classes to remove modelId and update embedding r…
Jan-Kazlouski-elastic Jul 10, 2025
ad009c6
Refactor Llama request classes to use pattern matching for secret set…
Jan-Kazlouski-elastic Jul 10, 2025
18ee182
Update embeddings handler to use HuggingFace response entity
Jan-Kazlouski-elastic Jul 10, 2025
c2621e7
Refactor Mistral model classes to remove modelId and update rate limi…
Jan-Kazlouski-elastic Jul 10, 2025
eb60dfa
Refactor Mistral action classes to remove taskSettings parameter and …
Jan-Kazlouski-elastic Jul 10, 2025
76ddf99
Refactor Llama and Mistral models to remove taskSettings parameter an…
Jan-Kazlouski-elastic Jul 10, 2025
9100f69
Refactor Llama service tests to use Model instead of CustomModel and …
Jan-Kazlouski-elastic Jul 11, 2025
5fb9dad
Remove unused tests and imports from LlamaServiceTests
Jan-Kazlouski-elastic Jul 11, 2025
47c9cc6
Add chunking settings support to Llama embeddings model tests
Jan-Kazlouski-elastic Jul 11, 2025
34e21de
Merge remote-tracking branch 'origin/main' into feature/llama-embedin…
Jan-Kazlouski-elastic Jul 11, 2025
1c1ba1d
Merge remote-tracking branch 'origin/main' into feature/llama-embedin…
Jan-Kazlouski-elastic Jul 14, 2025
c267269
Add changelog
Jan-Kazlouski-elastic Jul 14, 2025
098849f
Merge remote-tracking branch 'origin/main' into feature/llama-embedin…
Jan-Kazlouski-elastic Jul 15, 2025
06c7bd1
Add support for version checks in Llama settings and define new trans…
Jan-Kazlouski-elastic Jul 15, 2025
33da7a9
Merge remote-tracking branch 'origin/main' into feature/llama-embedin…
Jan-Kazlouski-elastic Jul 16, 2025
28335ef
Refactor Llama model assertions and remove unused version support met…
Jan-Kazlouski-elastic Jul 16, 2025
d43d1e9
Merge remote-tracking branch 'origin/main' into feature/llama-embedin…
Jan-Kazlouski-elastic Jul 18, 2025
528a9d9
Refactor Llama service constructors to include ClusterService and imp…
Jan-Kazlouski-elastic Jul 18, 2025
c879b96
Merge remote-tracking branch 'origin/main' into feature/llama-embedin…
Jan-Kazlouski-elastic Jul 18, 2025
5 changes: 5 additions & 0 deletions docs/changelog/130092.yaml
@@ -0,0 +1,5 @@
pr: 130092
summary: "Added Llama provider support to the Inference Plugin"
area: Machine Learning
type: enhancement
issues: []
@@ -220,6 +220,27 @@ public <T> void declareField(BiConsumer<Value, T> consumer, ContextParser<Contex
}
}

/**
* Declare a field that is an array of objects or null. Used to avoid calling the consumer when used with
* {@link #optionalConstructorArg()} or {@link #constructorArg()}.
* @param consumer Consumer that will be passed as is to the {@link #declareField(BiConsumer, ContextParser, ParseField, ValueType)}.
* @param objectParser Parser that will parse the objects in the array, checking for nulls.
* @param field Field to declare.
*/
@Override
public <T> void declareObjectArrayOrNull(
BiConsumer<Value, List<T>> consumer,
ContextParser<Context, T> objectParser,
ParseField field
) {
declareField(
consumer,
(p, c) -> p.currentToken() == XContentParser.Token.VALUE_NULL ? null : parseArray(p, c, objectParser),
field,
ValueType.OBJECT_ARRAY_OR_NULL
);
}

@Override
public <T> void declareNamedObject(
BiConsumer<Value, T> consumer,
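For context, the following is a minimal, hypothetical sketch of how a ConstructingObjectParser could use the new declaration so that an explicit JSON null for an array-of-objects field parses cleanly; the Chunk and ToolCall types and the "tool_calls" field name are assumptions for illustration, not code from this PR.

    // Editorial sketch; types and field names are assumed, not taken from this change.
    private static final ConstructingObjectParser<Chunk, Void> PARSER = new ConstructingObjectParser<>(
        "chunk",
        true,
        args -> new Chunk((String) args[0], (List<ToolCall>) args[1])
    );

    static {
        PARSER.declareString(ConstructingObjectParser.constructorArg(), new ParseField("id"));
        // "tool_calls": null is accepted and yields a null argument instead of a parse failure
        PARSER.declareObjectArrayOrNull(
            ConstructingObjectParser.optionalConstructorArg(),
            (p, c) -> ToolCall.PARSER.parse(p, c),
            new ParseField("tool_calls")
        );
    }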
@@ -343,6 +343,7 @@ static TransportVersion def(int id) {
public static final TransportVersion ESQL_CATEGORIZE_OPTIONS = def(9_122_0_00);
public static final TransportVersion ML_INFERENCE_AZURE_AI_STUDIO_RERANK_ADDED = def(9_123_0_00);
public static final TransportVersion PROJECT_STATE_REGISTRY_ENTRY = def(9_124_0_00);
public static final TransportVersion ML_INFERENCE_LLAMA_ADDED = def(9_125_0_00);

/*
* STOP! READ THIS FIRST! No, really,
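Defining the constant is only half of the change; settings classes typically consume it when deciding what can go over the wire. The fragment below is a hedged, illustrative sketch of the usual patterns and is not taken from this PR.

    // Editorial sketch: a new settings class usually reports the version that introduced it...
    @Override
    public TransportVersion getMinimalSupportedVersion() {
        return TransportVersions.ML_INFERENCE_LLAMA_ADDED;
    }

    // ...and any per-field gating follows the standard check:
    // if (out.getTransportVersion().onOrAfter(TransportVersions.ML_INFERENCE_LLAMA_ADDED)) { ... }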
@@ -121,7 +121,7 @@ public static Params withMaxTokensAndSkipStreamOptionsField(String modelId, Para
* - Key: {@link #MODEL_FIELD}, Value: modelId
* - Key: {@link #MAX_COMPLETION_TOKENS_FIELD}, Value: {@link #maxCompletionTokens()}
*/
public static Params withMaxCompletionTokensTokens(String modelId, Params params) {
public static Params withMaxCompletionTokens(String modelId, Params params) {
return new DelegatingMapParams(
Map.ofEntries(Map.entry(MODEL_ID_PARAM, modelId), Map.entry(MAX_TOKENS_PARAM, MAX_COMPLETION_TOKENS_FIELD)),
params
@@ -119,7 +119,7 @@ public void testParseAllFields() throws IOException {

assertThat(request, is(expected));
assertThat(
Strings.toString(request, UnifiedCompletionRequest.withMaxCompletionTokensTokens("gpt-4o", ToXContent.EMPTY_PARAMS)),
Strings.toString(request, UnifiedCompletionRequest.withMaxCompletionTokens("gpt-4o", ToXContent.EMPTY_PARAMS)),
is(XContentHelper.stripWhitespace(requestJson))
);
}
@@ -106,6 +106,8 @@
import org.elasticsearch.xpack.inference.services.jinaai.embeddings.JinaAIEmbeddingsTaskSettings;
import org.elasticsearch.xpack.inference.services.jinaai.rerank.JinaAIRerankServiceSettings;
import org.elasticsearch.xpack.inference.services.jinaai.rerank.JinaAIRerankTaskSettings;
import org.elasticsearch.xpack.inference.services.llama.completion.LlamaChatCompletionServiceSettings;
import org.elasticsearch.xpack.inference.services.llama.embeddings.LlamaEmbeddingsServiceSettings;
import org.elasticsearch.xpack.inference.services.mistral.completion.MistralChatCompletionServiceSettings;
import org.elasticsearch.xpack.inference.services.mistral.embeddings.MistralEmbeddingsServiceSettings;
import org.elasticsearch.xpack.inference.services.openai.completion.OpenAiChatCompletionServiceSettings;
@@ -175,6 +177,7 @@ public static List<NamedWriteableRegistry.Entry> getNamedWriteables() {
addJinaAINamedWriteables(namedWriteables);
addVoyageAINamedWriteables(namedWriteables);
addCustomNamedWriteables(namedWriteables);
addLlamaNamedWriteables(namedWriteables);

addUnifiedNamedWriteables(namedWriteables);

@@ -274,8 +277,25 @@ private static void addMistralNamedWriteables(List<NamedWriteableRegistry.Entry>
MistralChatCompletionServiceSettings::new
)
);
// no task settings for Mistral
}

// note - no task settings for Mistral embeddings...
private static void addLlamaNamedWriteables(List<NamedWriteableRegistry.Entry> namedWriteables) {
namedWriteables.add(
new NamedWriteableRegistry.Entry(
ServiceSettings.class,
LlamaEmbeddingsServiceSettings.NAME,
LlamaEmbeddingsServiceSettings::new
)
);
namedWriteables.add(
new NamedWriteableRegistry.Entry(
ServiceSettings.class,
LlamaChatCompletionServiceSettings.NAME,
LlamaChatCompletionServiceSettings::new
)
);
// no task settings for Llama
}

private static void addAzureAiStudioNamedWriteables(List<NamedWriteableRegistry.Entry> namedWriteables) {
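Registering these entries is what allows a polymorphic ServiceSettings instance to be resolved by name when it is read back off the wire; a one-line, hedged illustration (not from this PR):

    // Editorial sketch: deserialization relies on the entries registered above.
    ServiceSettings settings = in.readNamedWriteable(ServiceSettings.class);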
@@ -133,6 +133,7 @@
import org.elasticsearch.xpack.inference.services.huggingface.elser.HuggingFaceElserService;
import org.elasticsearch.xpack.inference.services.ibmwatsonx.IbmWatsonxService;
import org.elasticsearch.xpack.inference.services.jinaai.JinaAIService;
import org.elasticsearch.xpack.inference.services.llama.LlamaService;
import org.elasticsearch.xpack.inference.services.mistral.MistralService;
import org.elasticsearch.xpack.inference.services.openai.OpenAiService;
import org.elasticsearch.xpack.inference.services.sagemaker.SageMakerClient;
@@ -402,6 +403,7 @@ public List<InferenceServiceExtension.Factory> getInferenceServiceFactories() {
context -> new JinaAIService(httpFactory.get(), serviceComponents.get(), context),
context -> new VoyageAIService(httpFactory.get(), serviceComponents.get(), context),
context -> new DeepSeekService(httpFactory.get(), serviceComponents.get(), context),
context -> new LlamaService(httpFactory.get(), serviceComponents.get(), context),
ElasticsearchInternalService::new,
context -> new CustomService(httpFactory.get(), serviceComponents.get(), context)
);
@@ -17,6 +17,7 @@
import org.elasticsearch.core.Tuple;
import org.elasticsearch.inference.InputType;
import org.elasticsearch.inference.Model;
import org.elasticsearch.inference.ModelConfigurations;
import org.elasticsearch.inference.SimilarityMeasure;
import org.elasticsearch.inference.TaskType;
import org.elasticsearch.rest.RestStatus;
@@ -304,6 +305,12 @@ public static String invalidSettingError(String settingName, String scope) {
return Strings.format("[%s] does not allow the setting [%s]", scope, settingName);
}

public static URI extractUri(Map<String, Object> map, String fieldName, ValidationException validationException) {
String parsedUrl = extractRequiredString(map, fieldName, ModelConfigurations.SERVICE_SETTINGS, validationException);

return convertToUri(parsedUrl, fieldName, ModelConfigurations.SERVICE_SETTINGS, validationException);
}

public static URI convertToUri(@Nullable String url, String settingName, String settingScope, ValidationException validationException) {
try {
return createOptionalUri(url);
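The new ServiceUtils.extractUri helper simply chains the existing steps: extract the required string from the service settings map, then convert it to a URI while accumulating any errors. A hedged sketch of a typical call site, with the map variable and field name assumed for illustration:

    // Editorial sketch; not a call site from this PR.
    ValidationException validationException = new ValidationException();
    URI uri = extractUri(serviceSettingsMap, URL, validationException);
    if (validationException.validationErrors().isEmpty() == false) {
        throw validationException;
    }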
@@ -28,7 +28,7 @@ public ElasticInferenceServiceUnifiedChatCompletionRequestEntity(UnifiedChatInpu
@Override
public XContentBuilder toXContent(XContentBuilder builder, Params params) throws IOException {
builder.startObject();
unifiedRequestEntity.toXContent(builder, UnifiedCompletionRequest.withMaxCompletionTokensTokens(modelId, params));
unifiedRequestEntity.toXContent(builder, UnifiedCompletionRequest.withMaxCompletionTokens(modelId, params));
builder.endObject();

return builder;
@@ -31,11 +31,10 @@
import static org.elasticsearch.xpack.inference.services.ServiceFields.MAX_INPUT_TOKENS;
import static org.elasticsearch.xpack.inference.services.ServiceFields.SIMILARITY;
import static org.elasticsearch.xpack.inference.services.ServiceFields.URL;
import static org.elasticsearch.xpack.inference.services.ServiceUtils.convertToUri;
import static org.elasticsearch.xpack.inference.services.ServiceUtils.createUri;
import static org.elasticsearch.xpack.inference.services.ServiceUtils.extractOptionalPositiveInteger;
import static org.elasticsearch.xpack.inference.services.ServiceUtils.extractRequiredString;
import static org.elasticsearch.xpack.inference.services.ServiceUtils.extractSimilarity;
import static org.elasticsearch.xpack.inference.services.ServiceUtils.extractUri;

public class HuggingFaceServiceSettings extends FilteredXContentObject implements ServiceSettings, HuggingFaceRateLimitServiceSettings {
public static final String NAME = "hugging_face_service_settings";
@@ -70,12 +69,6 @@ public static HuggingFaceServiceSettings fromMap(Map<String, Object> map, Config
return new HuggingFaceServiceSettings(uri, similarityMeasure, dims, maxInputTokens, rateLimitSettings);
}

public static URI extractUri(Map<String, Object> map, String fieldName, ValidationException validationException) {
String parsedUrl = extractRequiredString(map, fieldName, ModelConfigurations.SERVICE_SETTINGS, validationException);

return convertToUri(parsedUrl, fieldName, ModelConfigurations.SERVICE_SETTINGS, validationException);
}

private final URI uri;
private final SimilarityMeasure similarity;
private final Integer dimensions;
@@ -31,7 +31,7 @@
import static org.elasticsearch.xpack.inference.services.ServiceFields.URL;
import static org.elasticsearch.xpack.inference.services.ServiceUtils.createUri;
import static org.elasticsearch.xpack.inference.services.ServiceUtils.extractOptionalString;
import static org.elasticsearch.xpack.inference.services.huggingface.HuggingFaceServiceSettings.extractUri;
import static org.elasticsearch.xpack.inference.services.ServiceUtils.extractUri;

/**
* Settings for the Hugging Face chat completion service.
@@ -28,7 +28,7 @@

import static org.elasticsearch.xpack.inference.services.ServiceFields.MAX_INPUT_TOKENS;
import static org.elasticsearch.xpack.inference.services.ServiceUtils.createUri;
import static org.elasticsearch.xpack.inference.services.huggingface.HuggingFaceServiceSettings.extractUri;
import static org.elasticsearch.xpack.inference.services.ServiceUtils.extractUri;

public class HuggingFaceElserServiceSettings extends FilteredXContentObject
implements
@@ -27,7 +27,7 @@
import java.util.Objects;

import static org.elasticsearch.xpack.inference.services.ServiceUtils.createUri;
import static org.elasticsearch.xpack.inference.services.huggingface.HuggingFaceServiceSettings.extractUri;
import static org.elasticsearch.xpack.inference.services.ServiceUtils.extractUri;

public class HuggingFaceRerankServiceSettings extends FilteredXContentObject
implements
@@ -0,0 +1,89 @@
/*
* Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
* or more contributor license agreements. Licensed under the Elastic License
* 2.0; you may not use this file except in compliance with the Elastic License
* 2.0.
*/

package org.elasticsearch.xpack.inference.services.llama;

import org.elasticsearch.inference.EmptySecretSettings;
import org.elasticsearch.inference.ModelConfigurations;
import org.elasticsearch.inference.ModelSecrets;
import org.elasticsearch.inference.SecretSettings;
import org.elasticsearch.inference.ServiceSettings;
import org.elasticsearch.xpack.inference.external.action.ExecutableAction;
import org.elasticsearch.xpack.inference.services.RateLimitGroupingModel;
import org.elasticsearch.xpack.inference.services.llama.action.LlamaActionVisitor;
import org.elasticsearch.xpack.inference.services.settings.DefaultSecretSettings;
import org.elasticsearch.xpack.inference.services.settings.RateLimitSettings;

import java.net.URI;
import java.net.URISyntaxException;
import java.util.Map;
import java.util.Objects;

/**
* Abstract class representing a Llama model for inference.
* This class extends RateLimitGroupingModel and provides common functionality for Llama models.
*/
public abstract class LlamaModel extends RateLimitGroupingModel {
protected URI uri;
protected RateLimitSettings rateLimitSettings;

/**
* Constructor for creating a LlamaModel with specified configurations and secrets.
*
* @param configurations the model configurations
* @param secrets the secret settings for the model
*/
protected LlamaModel(ModelConfigurations configurations, ModelSecrets secrets) {
super(configurations, secrets);
}

/**
* Constructor for creating a LlamaModel from an existing model with overridden service settings.
* @param model the existing model whose configurations and secrets are reused
* @param serviceSettings the new settings for the inference service
*/
protected LlamaModel(RateLimitGroupingModel model, ServiceSettings serviceSettings) {
super(model, serviceSettings);
}

public URI uri() {
return this.uri;
}

@Override
public RateLimitSettings rateLimitSettings() {
return this.rateLimitSettings;
}

@Override
public int rateLimitGroupingHash() {
return Objects.hash(getServiceSettings().modelId(), uri, getSecretSettings());
}

// Needed for testing only
public void setURI(String newUri) {
try {
this.uri = new URI(newUri);
} catch (URISyntaxException e) {
// swallow any error
}
}

/**
* Retrieves the secret settings from the provided map of secrets.
* If the map is non-null but empty, it returns {@link EmptySecretSettings#INSTANCE}; otherwise it delegates to {@link DefaultSecretSettings#fromMap}.
* This is because a Llama deployment has no out-of-the-box security settings and can be used without authentication.
*
* @param secrets the map containing secret settings
* @return an instance of SecretSettings
*/
protected static SecretSettings retrieveSecretSettings(Map<String, Object> secrets) {
return (secrets != null && secrets.isEmpty()) ? EmptySecretSettings.INSTANCE : DefaultSecretSettings.fromMap(secrets);
}

protected abstract ExecutableAction accept(LlamaActionVisitor creator);
}
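To make the abstract contract above concrete, here is a heavily hedged sketch of the shape a subclass could take; the class name, constructor, and the visitor's create method are assumptions for illustration, not the PR's actual LlamaModel subclasses.

    // Hypothetical subclass sketch; names and the visitor method are assumed.
    public class ExampleLlamaModel extends LlamaModel {

        public ExampleLlamaModel(ModelConfigurations configurations, ModelSecrets secrets) {
            super(configurations, secrets);
            // a real subclass would also initialize 'uri' and 'rateLimitSettings' from its service settings
        }

        @Override
        protected ExecutableAction accept(LlamaActionVisitor creator) {
            return creator.create(this); // double dispatch into the action creator that builds the ExecutableAction
        }
    }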