Skip to content

Commit 630af38

Browse files
authored
[ML] Create an ml node inference endpoint referencing an existing deployment (#114750)
1 parent 5e59ab5 commit 630af38

File tree

14 files changed

+482
-63
lines changed

14 files changed

+482
-63
lines changed

docs/changelog/114750.yaml

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,5 @@
1+
pr: 114750
2+
summary: Create an ml node inference endpoint referencing an existing deployment
3+
area: Machine Learning
4+
type: enhancement
5+
issues: []

server/src/main/java/org/elasticsearch/TransportVersions.java

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -244,6 +244,7 @@ static TransportVersion def(int id) {
244244
public static final TransportVersion OPT_IN_ESQL_CCS_EXECUTION_INFO = def(8_768_00_0);
245245
public static final TransportVersion QUERY_RULE_TEST_API = def(8_769_00_0);
246246
public static final TransportVersion ESQL_PER_AGGREGATE_FILTER = def(8_770_00_0);
247+
public static final TransportVersion ML_INFERENCE_ATTACH_TO_EXISTSING_DEPLOYMENT = def(8_771_00_0);
247248

248249
/*
249250
* STOP! READ THIS FIRST! No, really,

server/src/main/java/org/elasticsearch/inference/InferenceService.java

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -129,10 +129,10 @@ void chunkedInfer(
129129
/**
 * Stop the model deployment.
 * The default action does nothing except acknowledge the request (true);
 * services that manage ML node deployments override this to stop them.
 * @param unparsedModel The unparsed model configuration of the endpoint being stopped
 * @param listener The listener, notified with {@code true} once the stop is acknowledged
 */
default void stop(UnparsedModel unparsedModel, ActionListener<Boolean> listener) {
    listener.onResponse(true);
}
138138

Original file line numberDiff line numberDiff line change
@@ -0,0 +1,161 @@
1+
/*
2+
* Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
3+
* or more contributor license agreements. Licensed under the Elastic License
4+
* 2.0; you may not use this file except in compliance with the Elastic License
5+
* 2.0.
6+
*/
7+
8+
package org.elasticsearch.xpack.inference;
9+
10+
import org.elasticsearch.client.Request;
11+
import org.elasticsearch.client.Response;
12+
import org.elasticsearch.client.ResponseException;
13+
import org.elasticsearch.core.Strings;
14+
import org.elasticsearch.inference.TaskType;
15+
16+
import java.io.IOException;
17+
import java.util.List;
18+
import java.util.Map;
19+
20+
import static org.hamcrest.Matchers.containsString;
21+
import static org.hamcrest.Matchers.is;
22+
23+
public class CreateFromDeploymentIT extends InferenceBaseRestTest {
24+
25+
@SuppressWarnings("unchecked")
26+
public void testAttachToDeployment() throws IOException {
27+
var modelId = "attach_to_deployment";
28+
var deploymentId = "existing_deployment";
29+
30+
CustomElandModelIT.createMlNodeTextExpansionModel(modelId, client());
31+
var response = startMlNodeDeploymemnt(modelId, deploymentId);
32+
assertOkOrCreated(response);
33+
34+
var inferenceId = "inference_on_existing_deployment";
35+
var putModel = putModel(inferenceId, endpointConfig(deploymentId), TaskType.SPARSE_EMBEDDING);
36+
var serviceSettings = putModel.get("service_settings");
37+
assertThat(
38+
putModel.toString(),
39+
serviceSettings,
40+
is(Map.of("num_allocations", 1, "num_threads", 1, "model_id", "attach_to_deployment", "deployment_id", "existing_deployment"))
41+
);
42+
43+
var results = infer(inferenceId, List.of("washing machine"));
44+
assertNotNull(results.get("sparse_embedding"));
45+
46+
deleteModel(inferenceId);
47+
// assert deployment not stopped
48+
var stats = (List<Map<String, Object>>) getTrainedModelStats(modelId).get("trained_model_stats");
49+
var deploymentStats = stats.get(0).get("deployment_stats");
50+
assertNotNull(stats.toString(), deploymentStats);
51+
52+
stopMlNodeDeployment(deploymentId);
53+
}
54+
55+
public void testAttachWithModelId() throws IOException {
56+
var modelId = "attach_with_model_id";
57+
var deploymentId = "existing_deployment_with_model_id";
58+
59+
CustomElandModelIT.createMlNodeTextExpansionModel(modelId, client());
60+
var response = startMlNodeDeploymemnt(modelId, deploymentId);
61+
assertOkOrCreated(response);
62+
63+
var inferenceId = "inference_on_existing_deployment";
64+
var putModel = putModel(inferenceId, endpointConfig(modelId, deploymentId), TaskType.SPARSE_EMBEDDING);
65+
var serviceSettings = putModel.get("service_settings");
66+
assertThat(
67+
putModel.toString(),
68+
serviceSettings,
69+
is(
70+
Map.of(
71+
"num_allocations",
72+
1,
73+
"num_threads",
74+
1,
75+
"model_id",
76+
"attach_with_model_id",
77+
"deployment_id",
78+
"existing_deployment_with_model_id"
79+
)
80+
)
81+
);
82+
83+
var results = infer(inferenceId, List.of("washing machine"));
84+
assertNotNull(results.get("sparse_embedding"));
85+
86+
stopMlNodeDeployment(deploymentId);
87+
}
88+
89+
public void testModelIdDoesNotMatch() throws IOException {
90+
var modelId = "attach_with_model_id";
91+
var deploymentId = "existing_deployment_with_model_id";
92+
var aDifferentModelId = "not_the_same_as_the_one_used_in_the_deployment";
93+
94+
CustomElandModelIT.createMlNodeTextExpansionModel(modelId, client());
95+
var response = startMlNodeDeploymemnt(modelId, deploymentId);
96+
assertOkOrCreated(response);
97+
98+
var inferenceId = "inference_on_existing_deployment";
99+
var e = expectThrows(
100+
ResponseException.class,
101+
() -> putModel(inferenceId, endpointConfig(aDifferentModelId, deploymentId), TaskType.SPARSE_EMBEDDING)
102+
);
103+
assertThat(
104+
e.getMessage(),
105+
containsString(
106+
"Deployment [existing_deployment_with_model_id] uses model [attach_with_model_id] "
107+
+ "which does not match the model [not_the_same_as_the_one_used_in_the_deployment] in the request."
108+
)
109+
);
110+
}
111+
112+
private String endpointConfig(String deploymentId) {
113+
return Strings.format("""
114+
{
115+
"service": "elasticsearch",
116+
"service_settings": {
117+
"deployment_id": "%s"
118+
}
119+
}
120+
""", deploymentId);
121+
}
122+
123+
private String endpointConfig(String modelId, String deploymentId) {
124+
return Strings.format("""
125+
{
126+
"service": "elasticsearch",
127+
"service_settings": {
128+
"model_id": "%s",
129+
"deployment_id": "%s"
130+
}
131+
}
132+
""", modelId, deploymentId);
133+
}
134+
135+
private Response startMlNodeDeploymemnt(String modelId, String deploymentId) throws IOException {
136+
String endPoint = "/_ml/trained_models/"
137+
+ modelId
138+
+ "/deployment/_start?timeout=10s&wait_for=started"
139+
+ "&threads_per_allocation=1"
140+
+ "&number_of_allocations=1";
141+
142+
if (deploymentId != null) {
143+
endPoint = endPoint + "&deployment_id=" + deploymentId;
144+
}
145+
146+
Request request = new Request("POST", endPoint);
147+
return client().performRequest(request);
148+
}
149+
150+
protected void stopMlNodeDeployment(String deploymentId) throws IOException {
151+
String endpoint = "/_ml/trained_models/" + deploymentId + "/deployment/_stop";
152+
Request request = new Request("POST", endpoint);
153+
request.addParameter("force", "true");
154+
client().performRequest(request);
155+
}
156+
157+
protected Map<String, Object> getTrainedModelStats(String modelId) throws IOException {
158+
Request request = new Request("GET", "/_ml/trained_models/" + modelId + "/_stats");
159+
return entityAsMap(client().performRequest(request));
160+
}
161+
}

x-pack/plugin/inference/qa/inference-service-tests/src/javaRestTest/java/org/elasticsearch/xpack/inference/CustomElandModelIT.java

Lines changed: 24 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,7 @@
88
package org.elasticsearch.xpack.inference;
99

1010
import org.elasticsearch.client.Request;
11+
import org.elasticsearch.client.RestClient;
1112
import org.elasticsearch.core.Strings;
1213
import org.elasticsearch.inference.TaskType;
1314

@@ -65,11 +66,12 @@ public class CustomElandModelIT extends InferenceBaseRestTest {
6566
public void testSparse() throws IOException {
6667
String modelId = "custom-text-expansion-model";
6768

68-
createTextExpansionModel(modelId);
69-
putModelDefinition(modelId, BASE_64_ENCODED_MODEL, RAW_MODEL_SIZE);
69+
createTextExpansionModel(modelId, client());
70+
putModelDefinition(modelId, BASE_64_ENCODED_MODEL, RAW_MODEL_SIZE, client());
7071
putVocabulary(
7172
List.of("these", "are", "my", "words", "the", "washing", "machine", "is", "leaking", "octopus", "comforter", "smells"),
72-
modelId
73+
modelId,
74+
client()
7375
);
7476

7577
var inferenceConfig = """
@@ -90,7 +92,7 @@ public void testSparse() throws IOException {
9092
assertNotNull(results.get("sparse_embedding"));
9193
}
9294

93-
protected void createTextExpansionModel(String modelId) throws IOException {
95+
static void createTextExpansionModel(String modelId, RestClient client) throws IOException {
9496
// with_special_tokens: false for this test with limited vocab
9597
Request request = new Request("PUT", "/_ml/trained_models/" + modelId);
9698
request.setJsonEntity("""
@@ -107,10 +109,10 @@ protected void createTextExpansionModel(String modelId) throws IOException {
107109
}
108110
}
109111
}""");
110-
client().performRequest(request);
112+
client.performRequest(request);
111113
}
112114

113-
protected void putVocabulary(List<String> vocabulary, String modelId) throws IOException {
115+
static void putVocabulary(List<String> vocabulary, String modelId, RestClient client) throws IOException {
114116
List<String> vocabularyWithPad = new ArrayList<>();
115117
vocabularyWithPad.add("[PAD]");
116118
vocabularyWithPad.add("[UNK]");
@@ -121,14 +123,27 @@ protected void putVocabulary(List<String> vocabulary, String modelId) throws IOE
121123
request.setJsonEntity(Strings.format("""
122124
{ "vocabulary": [%s] }
123125
""", quotedWords));
124-
client().performRequest(request);
126+
client.performRequest(request);
125127
}
126128

127-
protected void putModelDefinition(String modelId, String base64EncodedModel, long unencodedModelSize) throws IOException {
129+
static void putModelDefinition(String modelId, String base64EncodedModel, long unencodedModelSize, RestClient client)
130+
throws IOException {
128131
Request request = new Request("PUT", "_ml/trained_models/" + modelId + "/definition/0");
129132
String body = Strings.format("""
130133
{"total_definition_length":%s,"definition": "%s","total_parts": 1}""", unencodedModelSize, base64EncodedModel);
131134
request.setJsonEntity(body);
132-
client().performRequest(request);
135+
client.performRequest(request);
133136
}
137+
138+
// Create the model including definition and vocab.
// Convenience helper bundling the three steps required for a usable ML node
// text expansion model: put the model configuration, upload the model
// definition, and put the (limited test) vocabulary.
static void createMlNodeTextExpansionModel(String modelId, RestClient client) throws IOException {
    createTextExpansionModel(modelId, client);
    putModelDefinition(modelId, BASE_64_ENCODED_MODEL, RAW_MODEL_SIZE, client);
    putVocabulary(
        List.of("these", "are", "my", "words", "the", "washing", "machine", "is", "leaking", "octopus", "comforter", "smells"),
        modelId,
        client
    );
}
148+
134149
}

x-pack/plugin/inference/qa/inference-service-tests/src/javaRestTest/java/org/elasticsearch/xpack/inference/InferenceBaseRestTest.java

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -207,7 +207,7 @@ protected void putSemanticText(String endpointId, String searchEndpointId, Strin
207207
}
208208

209209
protected Map<String, Object> putModel(String modelId, String modelConfig, TaskType taskType) throws IOException {
    // error_trace asks the server to include full stack traces in error
    // responses, making test failures easier to diagnose.
    String endpoint = Strings.format("_inference/%s/%s?error_trace", taskType, modelId);
    return putRequest(endpoint, modelConfig);
}
213213

x-pack/plugin/inference/src/main/java/org/elasticsearch/xpack/inference/action/TransportDeleteInferenceEndpointAction.java

Lines changed: 1 addition & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -9,8 +9,6 @@
99

1010
package org.elasticsearch.xpack.inference.action;
1111

12-
import org.apache.logging.log4j.LogManager;
13-
import org.apache.logging.log4j.Logger;
1412
import org.elasticsearch.ElasticsearchStatusException;
1513
import org.elasticsearch.action.ActionListener;
1614
import org.elasticsearch.action.ActionRunnable;
@@ -47,7 +45,6 @@ public class TransportDeleteInferenceEndpointAction extends TransportMasterNodeA
4745

4846
private final ModelRegistry modelRegistry;
4947
private final InferenceServiceRegistry serviceRegistry;
50-
private static final Logger logger = LogManager.getLogger(TransportDeleteInferenceEndpointAction.class);
5148
private final Executor executor;
5249

5350
@Inject
@@ -118,7 +115,7 @@ private void doExecuteForked(
118115

119116
var service = serviceRegistry.getService(unparsedModel.service());
120117
if (service.isPresent()) {
121-
service.get().stop(request.getInferenceEndpointId(), listener);
118+
service.get().stop(unparsedModel, listener);
122119
} else {
123120
listener.onFailure(
124121
new ElasticsearchStatusException(

x-pack/plugin/inference/src/main/java/org/elasticsearch/xpack/inference/services/elasticsearch/BaseElasticsearchInternalService.java

Lines changed: 29 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,7 @@
2222
import org.elasticsearch.inference.InputType;
2323
import org.elasticsearch.inference.Model;
2424
import org.elasticsearch.inference.TaskType;
25+
import org.elasticsearch.inference.UnparsedModel;
2526
import org.elasticsearch.xpack.core.ClientHelper;
2627
import org.elasticsearch.xpack.core.ml.MachineLearningField;
2728
import org.elasticsearch.xpack.core.ml.action.GetTrainedModelsAction;
@@ -98,6 +99,12 @@ public void start(Model model, ActionListener<Boolean> finalListener) {
9899
return;
99100
}
100101

102+
if (esModel.usesExistingDeployment()) {
103+
// don't start a deployment
104+
finalListener.onResponse(Boolean.TRUE);
105+
return;
106+
}
107+
101108
SubscribableListener.<Boolean>newForked(forkedListener -> { isBuiltinModelPut(model, forkedListener); })
102109
.<Boolean>andThen((l, modelConfigExists) -> {
103110
if (modelConfigExists == false) {
@@ -119,14 +126,28 @@ public void start(Model model, ActionListener<Boolean> finalListener) {
119126
}
120127

121128
@Override
/**
 * Stops the ML node deployment backing this endpoint, unless the endpoint was
 * configured with an existing deployment_id, in which case the deployment is
 * owned elsewhere and is left running.
 * @param unparsedModel The persisted endpoint configuration to parse
 * @param listener Notified with {@code true} when the stop (or no-op) completes
 */
@Override
public void stop(UnparsedModel unparsedModel, ActionListener<Boolean> listener) {

    var model = parsePersistedConfig(unparsedModel.inferenceEntityId(), unparsedModel.taskType(), unparsedModel.settings());
    if (model instanceof ElasticsearchInternalModel esModel) {

        var serviceSettings = esModel.getServiceSettings();
        if (serviceSettings.getDeploymentId() != null) {
            // configured with an existing deployment so do not stop it
            listener.onResponse(Boolean.TRUE);
            return;
        }

        // force the stop so a lingering reference cannot block it
        var request = new StopTrainedModelDeploymentAction.Request(esModel.mlNodeDeploymentId());
        request.setForce(true);
        client.execute(
            StopTrainedModelDeploymentAction.INSTANCE,
            request,
            listener.delegateFailureAndWrap((delegatedResponseListener, response) -> delegatedResponseListener.onResponse(Boolean.TRUE))
        );
    } else {
        listener.onFailure(notElasticsearchModelException(model));
    }
}
131152

132153
protected static IllegalStateException notElasticsearchModelException(Model model) {

0 commit comments

Comments
 (0)