kderusso
diff --git a/‎docs/changelog/114719.yaml‎
Lines changed: 5 additions & 0 deletions b/‎docs/changelog/114719.yaml‎
Lines changed: 5 additions & 0 deletions
diff --git a/‎test/test-clusters/src/main/java/org/elasticsearch/test/cluster/FeatureFlag.java‎
Lines changed: 2 additions & 1 deletion b/‎test/test-clusters/src/main/java/org/elasticsearch/test/cluster/FeatureFlag.java‎
Lines changed: 2 additions & 1 deletion
diff --git a/‎x-pack/plugin/core/src/main/java/org/elasticsearch/xpack/core/ml/inference/assignment/TrainedModelAssignment.java‎
Lines changed: 2 additions & 5 deletions b/‎x-pack/plugin/core/src/main/java/org/elasticsearch/xpack/core/ml/inference/assignment/TrainedModelAssignment.java‎
Lines changed: 2 additions & 5 deletions
diff --git a/‎x-pack/plugin/core/src/test/java/org/elasticsearch/xpack/core/ml/inference/assignment/TrainedModelAssignmentTests.java‎
Lines changed: 7 additions & 7 deletions b/‎x-pack/plugin/core/src/test/java/org/elasticsearch/xpack/core/ml/inference/assignment/TrainedModelAssignmentTests.java‎
Lines changed: 7 additions & 7 deletions
diff --git a/‎x-pack/plugin/ml/qa/native-multi-node-tests/src/javaRestTest/java/org/elasticsearch/xpack/ml/integration/AdaptiveAllocationsScaleFromZeroIT.java‎
Lines changed: 133 additions & 0 deletions b/‎x-pack/plugin/ml/qa/native-multi-node-tests/src/javaRestTest/java/org/elasticsearch/xpack/ml/integration/AdaptiveAllocationsScaleFromZeroIT.java‎
Lines changed: 133 additions & 0 deletions
diff --git a/‎x-pack/plugin/ml/qa/native-multi-node-tests/src/javaRestTest/java/org/elasticsearch/xpack/ml/integration/PyTorchModelRestTestCase.java‎
Lines changed: 34 additions & 1 deletion b/‎x-pack/plugin/ml/qa/native-multi-node-tests/src/javaRestTest/java/org/elasticsearch/xpack/ml/integration/PyTorchModelRestTestCase.java‎
Lines changed: 34 additions & 1 deletion
diff --git a/‎x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/action/TransportExternalInferModelAction.java‎
Lines changed: 8 additions & 2 deletions b/‎x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/action/TransportExternalInferModelAction.java‎
Lines changed: 8 additions & 2 deletions
@@ -0,0 +1,5 @@
+pr: 114719
+summary: Wait for allocation on scale up
+area: Machine Learning
+type: enhancement
+issues: []
@@ -20,7 +20,8 @@ public enum FeatureFlag {
     FAILURE_STORE_ENABLED("es.failure_store_feature_flag_enabled=true", Version.fromString("8.12.0"), null),
     SUB_OBJECTS_AUTO_ENABLED("es.sub_objects_auto_feature_flag_enabled=true", Version.fromString("8.16.0"), null),
     CHUNKING_SETTINGS_ENABLED("es.inference_chunking_settings_feature_flag_enabled=true", Version.fromString("8.16.0"), null),
-    INFERENCE_DEFAULT_ELSER("es.inference_default_elser_feature_flag_enabled=true", Version.fromString("8.16.0"), null);
+    INFERENCE_DEFAULT_ELSER("es.inference_default_elser_feature_flag_enabled=true", Version.fromString("8.16.0"), null),
+    ML_SCALE_FROM_ZERO("es.ml_scale_from_zero_feature_flag_enabled=true", Version.fromString("8.16.0"), null);
 
     public final String systemProperty;
     public final Version from;
 
@@ -224,15 +224,12 @@ public boolean hasStartedRoutes() {
         return nodeRoutingTable.values().stream().anyMatch(routeInfo -> routeInfo.getState() == RoutingState.STARTED);
     }
 
-    public List<Tuple<String, Integer>> selectRandomStartedNodesWeighedOnAllocationsForNRequests(
-        int numberOfRequests,
-        RoutingState requiredState
-    ) {
+    public List<Tuple<String, Integer>> selectRandomNodesWeighedOnAllocations(int numberOfRequests, RoutingState... acceptableStates) {
         List<String> nodeIds = new ArrayList<>(nodeRoutingTable.size());
         List<Integer> cumulativeAllocations = new ArrayList<>(nodeRoutingTable.size());
         int allocationSum = 0;
         for (Map.Entry<String, RoutingInfo> routingEntry : nodeRoutingTable.entrySet()) {
-            if (routingEntry.getValue().getState() == requiredState) {
+            if (routingEntry.getValue().getState().isAnyOf(acceptableStates)) {
                 nodeIds.add(routingEntry.getKey());
                 allocationSum += routingEntry.getValue().getCurrentAllocations();
                 cumulativeAllocations.add(allocationSum);
 
@@ -195,15 +195,15 @@ public void testselectRandomStartedNodeWeighedOnAllocationsForNRequests_GivenNoS
         builder.addRoutingEntry("node-2", new RoutingInfo(1, 1, RoutingState.STOPPED, ""));
         TrainedModelAssignment assignment = builder.build();
 
-        assertThat(assignment.selectRandomStartedNodesWeighedOnAllocationsForNRequests(1, RoutingState.STARTED).isEmpty(), is(true));
+        assertThat(assignment.selectRandomNodesWeighedOnAllocations(1, RoutingState.STARTED).isEmpty(), is(true));
     }
 
     public void testselectRandomStartedNodeWeighedOnAllocationsForNRequests_GivenSingleStartedNode() {
         TrainedModelAssignment.Builder builder = TrainedModelAssignment.Builder.empty(randomTaskParams(5), null);
         builder.addRoutingEntry("node-1", new RoutingInfo(4, 4, RoutingState.STARTED, ""));
         TrainedModelAssignment assignment = builder.build();
 
-        var nodes = assignment.selectRandomStartedNodesWeighedOnAllocationsForNRequests(1, RoutingState.STARTED);
+        var nodes = assignment.selectRandomNodesWeighedOnAllocations(1, RoutingState.STARTED);
 
         assertThat(nodes, contains(new Tuple<>("node-1", 1)));
     }
@@ -213,7 +213,7 @@ public void testselectRandomStartedNodeWeighedOnAllocationsForNRequests_GivenASh
         builder.addRoutingEntry("node-1", new RoutingInfo(4, 4, RoutingState.STARTED, ""));
         TrainedModelAssignment assignment = builder.build();
 
-        var nodes = assignment.selectRandomStartedNodesWeighedOnAllocationsForNRequests(1, RoutingState.STOPPING);
+        var nodes = assignment.selectRandomNodesWeighedOnAllocations(1, RoutingState.STOPPING);
 
         assertThat(nodes, empty());
     }
@@ -223,7 +223,7 @@ public void testselectRandomStartedNodeWeighedOnAllocationsForNRequests_GivenASh
         builder.addRoutingEntry("node-1", new RoutingInfo(4, 4, RoutingState.STOPPING, ""));
         TrainedModelAssignment assignment = builder.build();
 
-        var nodes = assignment.selectRandomStartedNodesWeighedOnAllocationsForNRequests(1, RoutingState.STOPPING);
+        var nodes = assignment.selectRandomNodesWeighedOnAllocations(1, RoutingState.STOPPING);
 
         assertThat(nodes, contains(new Tuple<>("node-1", 1)));
     }
@@ -234,7 +234,7 @@ public void testSingleRequestWith2Nodes() {
         builder.addRoutingEntry("node-2", new RoutingInfo(1, 1, RoutingState.STARTED, ""));
         TrainedModelAssignment assignment = builder.build();
 
-        var nodes = assignment.selectRandomStartedNodesWeighedOnAllocationsForNRequests(1, RoutingState.STARTED);
+        var nodes = assignment.selectRandomNodesWeighedOnAllocations(1, RoutingState.STARTED);
         assertThat(nodes, hasSize(1));
         assertEquals(nodes.get(0).v2(), Integer.valueOf(1));
     }
@@ -248,7 +248,7 @@ public void testSelectRandomStartedNodeWeighedOnAllocationsForNRequests_GivenMul
 
         final int selectionCount = 10000;
         final CountAccumulator countsPerNodeAccumulator = new CountAccumulator();
-        var nodes = assignment.selectRandomStartedNodesWeighedOnAllocationsForNRequests(selectionCount, RoutingState.STARTED);
+        var nodes = assignment.selectRandomNodesWeighedOnAllocations(selectionCount, RoutingState.STARTED);
 
         assertThat(nodes, hasSize(3));
         assertThat(nodes.stream().mapToInt(Tuple::v2).sum(), equalTo(selectionCount));
@@ -269,7 +269,7 @@ public void testselectRandomStartedNodeWeighedOnAllocationsForNRequests_GivenMul
         builder.addRoutingEntry("node-3", new RoutingInfo(0, 0, RoutingState.STARTED, ""));
         TrainedModelAssignment assignment = builder.build();
         final int selectionCount = 1000;
-        var nodeCounts = assignment.selectRandomStartedNodesWeighedOnAllocationsForNRequests(selectionCount, RoutingState.STARTED);
+        var nodeCounts = assignment.selectRandomNodesWeighedOnAllocations(selectionCount, RoutingState.STARTED);
         assertThat(nodeCounts, hasSize(3));
 
         var selectedNodes = new HashSet<String>();
 
@@ -0,0 +1,133 @@
+/*
+ * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
+ * or more contributor license agreements. Licensed under the Elastic License
+ * 2.0; you may not use this file except in compliance with the Elastic License
+ * 2.0.
+ */
+
+package org.elasticsearch.xpack.ml.integration;
+
+import org.apache.lucene.tests.util.LuceneTestCase;
+import org.elasticsearch.client.Response;
+import org.elasticsearch.client.ResponseListener;
+import org.elasticsearch.common.xcontent.support.XContentMapValues;
+import org.elasticsearch.core.TimeValue;
+import org.elasticsearch.xpack.core.ml.inference.assignment.AdaptiveAllocationsSettings;
+
+import java.util.Arrays;
+import java.util.List;
+import java.util.Map;
+import java.util.concurrent.ConcurrentLinkedDeque;
+import java.util.concurrent.CountDownLatch;
+import java.util.concurrent.TimeUnit;
+
+import static org.hamcrest.Matchers.empty;
+import static org.hamcrest.Matchers.hasSize;
+import static org.hamcrest.Matchers.is;
+import static org.hamcrest.Matchers.not;
+import static org.hamcrest.Matchers.nullValue;
+
+/**
+ * REST integration tests for inference against deployments whose allocation count has dropped
+ * to zero. Each test starts one or more deployments with adaptive allocations, waits until the
+ * reported allocation count is 0, then fires concurrent inference requests and expects every
+ * one of them to succeed (the requests themselves are what trigger the scale up).
+ * <p>
+ * The class is currently muted via {@code @AwaitsFix}; see the annotation for the reason.
+ */
+@LuceneTestCase.AwaitsFix(bugUrl = "Cannot test without setting the scale to zero period to a small value")
+public class AdaptiveAllocationsScaleFromZeroIT extends PyTorchModelRestTestCase {
+
+    /**
+     * Single deployment: verifies it starts with one allocation, waits for it to scale down to
+     * zero, then sends 10 concurrent inference requests and asserts none of them failed.
+     */
+    @SuppressWarnings("unchecked")
+    public void testScaleFromZero() throws Exception {
+        String modelId = "test_scale_from_zero";
+        createPassThroughModel(modelId);
+        putModelDefinition(modelId, PyTorchModelIT.BASE_64_ENCODED_MODEL, PyTorchModelIT.RAW_MODEL_SIZE);
+        putVocabulary(List.of("Auto", "scale", "and", "infer"), modelId);
+
+        // NOTE(review): arguments are presumably (enabled, min allocations, max allocations) - confirm
+        startDeployment(modelId, modelId, new AdaptiveAllocationsSettings(true, 0, 1));
+        {
+            var responseMap = entityAsMap(getTrainedModelStats(modelId));
+            List<Map<String, Object>> stats = (List<Map<String, Object>>) responseMap.get("trained_model_stats");
+            String statusState = (String) XContentMapValues.extractValue("deployment_stats.allocation_status.state", stats.get(0));
+            assertThat(responseMap.toString(), statusState, is(not(nullValue())));
+            Integer count = (Integer) XContentMapValues.extractValue("deployment_stats.allocation_status.allocation_count", stats.get(0));
+            assertThat(responseMap.toString(), count, is(1));
+        }
+
+        // wait for scale down. The scaler service will check every 10 seconds
+        assertBusy(() -> {
+            var statsMap = entityAsMap(getTrainedModelStats(modelId));
+            List<Map<String, Object>> innerStats = (List<Map<String, Object>>) statsMap.get("trained_model_stats");
+            Integer innerCount = (Integer) XContentMapValues.extractValue(
+                "deployment_stats.allocation_status.allocation_count",
+                innerStats.get(0)
+            );
+            assertThat(statsMap.toString(), innerCount, is(0));
+        }, 30, TimeUnit.SECONDS);
+
+        var failures = new ConcurrentLinkedDeque<Exception>();
+
+        // infer will scale up
+        int inferenceCount = 10;
+        var latch = new CountDownLatch(inferenceCount);
+        for (int i = 0; i < inferenceCount; i++) {
+            asyncInfer("Auto scale and infer", modelId, TimeValue.timeValueSeconds(5), new ResponseListener() {
+                @Override
+                public void onSuccess(Response response) {
+                    latch.countDown();
+                }
+
+                @Override
+                public void onFailure(Exception exception) {
+                    // Record the failure BEFORE releasing the latch: otherwise the test thread can
+                    // wake up and assert on an empty collection before this failure is visible.
+                    failures.add(exception);
+                    latch.countDown();
+                }
+            });
+        }
+
+        latch.await();
+        assertThat(failures, empty());
+    }
+
+    /**
+     * Three deployments: waits for all of them to scale down to zero, then sends 10 concurrent
+     * inference requests, each to a randomly chosen deployment, and asserts none of them failed.
+     */
+    @SuppressWarnings("unchecked")
+    public void testMultipleDeploymentsWaiting() throws Exception {
+        String id1 = "test_scale_from_zero_dep_1";
+        String id2 = "test_scale_from_zero_dep_2";
+        String id3 = "test_scale_from_zero_dep_3";
+        var idsList = Arrays.asList(id1, id2, id3);
+        for (var modelId : idsList) {
+            createPassThroughModel(modelId);
+            putModelDefinition(modelId, PyTorchModelIT.BASE_64_ENCODED_MODEL, PyTorchModelIT.RAW_MODEL_SIZE);
+            putVocabulary(List.of("Auto", "scale", "and", "infer"), modelId);
+
+            startDeployment(modelId, modelId, new AdaptiveAllocationsSettings(true, 0, 1));
+        }
+
+        // wait for scale down. The scaler service will check every 10 seconds
+        assertBusy(() -> {
+            var statsMap = entityAsMap(getTrainedModelStats("test_scale_from_zero_dep_*"));
+            List<Map<String, Object>> innerStats = (List<Map<String, Object>>) statsMap.get("trained_model_stats");
+            assertThat(innerStats, hasSize(3));
+            for (Map<String, Object> deploymentStats : innerStats) {
+                Integer innerCount = (Integer) XContentMapValues.extractValue(
+                    "deployment_stats.allocation_status.allocation_count",
+                    deploymentStats
+                );
+                assertThat(statsMap.toString(), innerCount, is(0));
+            }
+        }, 30, TimeUnit.SECONDS);
+
+        // Failures are collected here and asserted on the test thread. Calling fail() inside the
+        // listener would throw on the callback's thread, where it cannot fail the test.
+        var failures = new ConcurrentLinkedDeque<Exception>();
+
+        // infer will scale up
+        int inferenceCount = 10;
+        var latch = new CountDownLatch(inferenceCount);
+        for (int i = 0; i < inferenceCount; i++) {
+            asyncInfer("Auto scale and infer", randomFrom(idsList), TimeValue.timeValueSeconds(5), new ResponseListener() {
+                @Override
+                public void onSuccess(Response response) {
+                    latch.countDown();
+                }
+
+                @Override
+                public void onFailure(Exception exception) {
+                    // Record first, then release the latch, so the assertion below sees the failure.
+                    failures.add(exception);
+                    latch.countDown();
+                }
+            });
+        }
+
+        latch.await();
+        assertThat(failures, empty());
+    }
+}
@@ -10,13 +10,17 @@
 import org.apache.http.util.EntityUtils;
 import org.elasticsearch.client.Request;
 import org.elasticsearch.client.Response;
+import org.elasticsearch.client.ResponseListener;
+import org.elasticsearch.common.Strings;
 import org.elasticsearch.common.settings.Settings;
 import org.elasticsearch.common.util.concurrent.ThreadContext;
 import org.elasticsearch.common.xcontent.support.XContentMapValues;
-import org.elasticsearch.core.Strings;
 import org.elasticsearch.core.TimeValue;
 import org.elasticsearch.test.SecuritySettingsSourceField;
 import org.elasticsearch.test.rest.ESRestTestCase;
+import org.elasticsearch.xcontent.XContentBuilder;
+import org.elasticsearch.xcontent.json.JsonXContent;
+import org.elasticsearch.xpack.core.ml.inference.assignment.AdaptiveAllocationsSettings;
 import org.elasticsearch.xpack.core.ml.inference.assignment.AllocationStatus;
 import org.elasticsearch.xpack.core.ml.inference.assignment.Priority;
 import org.elasticsearch.xpack.core.ml.integration.MlRestTestStateCleaner;
@@ -282,6 +286,27 @@ protected Response startDeployment(
         return client().performRequest(request);
     }
 
+    /**
+     * Starts a deployment of {@code modelId} under the id {@code deploymentId}, sending the
+     * given settings in the request body under the {@code adaptive_allocations} field. The
+     * request asks for one thread per allocation and waits for the {@code started} state.
+     *
+     * @param modelId                     id of the trained model to deploy
+     * @param deploymentId                id to assign to the deployment
+     * @param adaptiveAllocationsSettings settings written to the JSON request body
+     * @return the response of the start-deployment request
+     * @throws IOException if building the body or performing the request fails
+     */
+    protected Response startDeployment(String modelId, String deploymentId, AdaptiveAllocationsSettings adaptiveAllocationsSettings)
+        throws IOException {
+        // Request body: { "adaptive_allocations": <settings> }
+        XContentBuilder bodyBuilder = JsonXContent.contentBuilder()
+            .startObject()
+            .field("adaptive_allocations", adaptiveAllocationsSettings)
+            .endObject();
+
+        Request startRequest = new Request(
+            "POST",
+            "/_ml/trained_models/"
+                + modelId
+                + "/deployment/_start"
+                + "?deployment_id="
+                + deploymentId
+                + "&threads_per_allocation=1"
+                + "&wait_for=started"
+        );
+        startRequest.setJsonEntity(Strings.toString(bodyBuilder));
+        return client().performRequest(startRequest);
+    }
+
     protected void stopDeployment(String modelId) throws IOException {
         stopDeployment(modelId, false, false);
     }
@@ -325,6 +350,14 @@ protected Response infer(String input, String modelId, TimeValue timeout) throws
         return client().performRequest(request);
     }
 
+    /**
+     * Sends an {@code _infer} request for {@code modelId} asynchronously; the outcome is
+     * reported to {@code responseListener} rather than returned.
+     *
+     * @param input            text placed in the single document's {@code input} field; it is
+     *                         substituted into the JSON body as-is, without escaping
+     * @param modelId          id used in the request path
+     * @param timeout          value passed as the {@code timeout} query parameter
+     * @param responseListener callback notified on success or failure of the request
+     */
+    protected void asyncInfer(String input, String modelId, TimeValue timeout, ResponseListener responseListener) throws IOException {
+        String endpoint = "/_ml/trained_models/" + modelId + "/_infer?timeout=" + timeout.toString();
+        Request inferRequest = new Request("POST", endpoint);
+
+        String payload = Strings.format("""
+            {  "docs": [{"input":"%s"}] }
+            """, input);
+        inferRequest.setJsonEntity(payload);
+
+        client().performRequestAsync(inferRequest, responseListener);
+    }
+
     protected Response infer(String input, String modelId) throws IOException {
         Request request = new Request("POST", "/_ml/trained_models/" + modelId + "/_infer?timeout=30s");
         request.setJsonEntity(Strings.format("""
 
@@ -11,9 +11,11 @@
 import org.elasticsearch.cluster.service.ClusterService;
 import org.elasticsearch.injection.guice.Inject;
 import org.elasticsearch.license.XPackLicenseState;
+import org.elasticsearch.threadpool.ThreadPool;
 import org.elasticsearch.transport.TransportService;
 import org.elasticsearch.xpack.core.ml.action.InferModelAction;
 import org.elasticsearch.xpack.ml.inference.adaptiveallocations.AdaptiveAllocationsScalerService;
+import org.elasticsearch.xpack.ml.inference.assignment.TrainedModelAssignmentService;
 import org.elasticsearch.xpack.ml.inference.loadingservice.ModelLoadingService;
 import org.elasticsearch.xpack.ml.inference.persistence.TrainedModelProvider;
 
@@ -27,7 +29,9 @@ public TransportExternalInferModelAction(
         ClusterService clusterService,
         XPackLicenseState licenseState,
         TrainedModelProvider trainedModelProvider,
-        AdaptiveAllocationsScalerService adaptiveAllocationsScalerService
+        AdaptiveAllocationsScalerService adaptiveAllocationsScalerService,
+        TrainedModelAssignmentService assignmentService,
+        ThreadPool threadPool
     ) {
         super(
             InferModelAction.EXTERNAL_NAME,
@@ -38,7 +42,9 @@ public TransportExternalInferModelAction(
             clusterService,
             licenseState,
             trainedModelProvider,
-            adaptiveAllocationsScalerService
+            adaptiveAllocationsScalerService,
+            assignmentService,
+            threadPool
         );
     }
 }