davidkyle
diff --git a/‎x-pack/plugin/core/src/main/java/org/elasticsearch/xpack/core/ml/inference/assignment/TrainedModelAssignment.java‎
Lines changed: 3 additions & 3 deletions b/‎x-pack/plugin/core/src/main/java/org/elasticsearch/xpack/core/ml/inference/assignment/TrainedModelAssignment.java‎
Lines changed: 3 additions & 3 deletions
diff --git a/‎x-pack/plugin/core/src/test/java/org/elasticsearch/xpack/core/ml/inference/assignment/TrainedModelAssignmentTests.java‎
Lines changed: 7 additions & 7 deletions b/‎x-pack/plugin/core/src/test/java/org/elasticsearch/xpack/core/ml/inference/assignment/TrainedModelAssignmentTests.java‎
Lines changed: 7 additions & 7 deletions
diff --git a/‎x-pack/plugin/ml/qa/native-multi-node-tests/src/javaRestTest/java/org/elasticsearch/xpack/ml/integration/AdaptiveAllocationsScaleFromZeroIT.java‎
Lines changed: 96 additions & 0 deletions b/‎x-pack/plugin/ml/qa/native-multi-node-tests/src/javaRestTest/java/org/elasticsearch/xpack/ml/integration/AdaptiveAllocationsScaleFromZeroIT.java‎
Lines changed: 96 additions & 0 deletions
diff --git a/‎x-pack/plugin/ml/qa/native-multi-node-tests/src/javaRestTest/java/org/elasticsearch/xpack/ml/integration/PyTorchModelRestTestCase.java‎
Lines changed: 42 additions & 1 deletion b/‎x-pack/plugin/ml/qa/native-multi-node-tests/src/javaRestTest/java/org/elasticsearch/xpack/ml/integration/PyTorchModelRestTestCase.java‎
Lines changed: 42 additions & 1 deletion
diff --git a/‎x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/MachineLearning.java‎
Lines changed: 14 additions & 1 deletion b/‎x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/MachineLearning.java‎
Lines changed: 14 additions & 1 deletion
diff --git a/‎x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/action/TransportExternalInferModelAction.java‎
Lines changed: 5 additions & 2 deletions b/‎x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/action/TransportExternalInferModelAction.java‎
Lines changed: 5 additions & 2 deletions
@@ -224,15 +224,15 @@ public boolean hasStartedRoutes() {
         return nodeRoutingTable.values().stream().anyMatch(routeInfo -> routeInfo.getState() == RoutingState.STARTED);
     }
 
-    public List<Tuple<String, Integer>> selectRandomStartedNodesWeighedOnAllocationsForNRequests(
+    public List<Tuple<String, Integer>> selectRandomNodesWeighedOnAllocationsForNRequestsAndState(
         int numberOfRequests,
-        RoutingState requiredState
+        RoutingState ... acceptableStates
     ) {
         List<String> nodeIds = new ArrayList<>(nodeRoutingTable.size());
         List<Integer> cumulativeAllocations = new ArrayList<>(nodeRoutingTable.size());
         int allocationSum = 0;
         for (Map.Entry<String, RoutingInfo> routingEntry : nodeRoutingTable.entrySet()) {
-            if (routingEntry.getValue().getState() == requiredState) {
+            if (routingEntry.getValue().getState().isAnyOf(acceptableStates)) {
                 nodeIds.add(routingEntry.getKey());
                 allocationSum += routingEntry.getValue().getCurrentAllocations();
                 cumulativeAllocations.add(allocationSum);
 
@@ -195,15 +195,15 @@ public void testselectRandomStartedNodeWeighedOnAllocationsForNRequests_GivenNoS
         builder.addRoutingEntry("node-2", new RoutingInfo(1, 1, RoutingState.STOPPED, ""));
         TrainedModelAssignment assignment = builder.build();
 
-        assertThat(assignment.selectRandomStartedNodesWeighedOnAllocationsForNRequests(1, RoutingState.STARTED).isEmpty(), is(true));
+        assertThat(assignment.selectRandomNodesWeighedOnAllocationsForNRequestsAndState(1, RoutingState.STARTED).isEmpty(), is(true));
     }
 
     public void testselectRandomStartedNodeWeighedOnAllocationsForNRequests_GivenSingleStartedNode() {
         TrainedModelAssignment.Builder builder = TrainedModelAssignment.Builder.empty(randomTaskParams(5), null);
         builder.addRoutingEntry("node-1", new RoutingInfo(4, 4, RoutingState.STARTED, ""));
         TrainedModelAssignment assignment = builder.build();
 
-        var nodes = assignment.selectRandomStartedNodesWeighedOnAllocationsForNRequests(1, RoutingState.STARTED);
+        var nodes = assignment.selectRandomNodesWeighedOnAllocationsForNRequestsAndState(1, RoutingState.STARTED);
 
         assertThat(nodes, contains(new Tuple<>("node-1", 1)));
     }
@@ -213,7 +213,7 @@ public void testselectRandomStartedNodeWeighedOnAllocationsForNRequests_GivenASh
         builder.addRoutingEntry("node-1", new RoutingInfo(4, 4, RoutingState.STARTED, ""));
         TrainedModelAssignment assignment = builder.build();
 
-        var nodes = assignment.selectRandomStartedNodesWeighedOnAllocationsForNRequests(1, RoutingState.STOPPING);
+        var nodes = assignment.selectRandomNodesWeighedOnAllocationsForNRequestsAndState(1, RoutingState.STOPPING);
 
         assertThat(nodes, empty());
     }
@@ -223,7 +223,7 @@ public void testselectRandomStartedNodeWeighedOnAllocationsForNRequests_GivenASh
         builder.addRoutingEntry("node-1", new RoutingInfo(4, 4, RoutingState.STOPPING, ""));
         TrainedModelAssignment assignment = builder.build();
 
-        var nodes = assignment.selectRandomStartedNodesWeighedOnAllocationsForNRequests(1, RoutingState.STOPPING);
+        var nodes = assignment.selectRandomNodesWeighedOnAllocationsForNRequestsAndState(1, RoutingState.STOPPING);
 
         assertThat(nodes, contains(new Tuple<>("node-1", 1)));
     }
@@ -234,7 +234,7 @@ public void testSingleRequestWith2Nodes() {
         builder.addRoutingEntry("node-2", new RoutingInfo(1, 1, RoutingState.STARTED, ""));
         TrainedModelAssignment assignment = builder.build();
 
-        var nodes = assignment.selectRandomStartedNodesWeighedOnAllocationsForNRequests(1, RoutingState.STARTED);
+        var nodes = assignment.selectRandomNodesWeighedOnAllocationsForNRequestsAndState(1, RoutingState.STARTED);
         assertThat(nodes, hasSize(1));
         assertEquals(nodes.get(0).v2(), Integer.valueOf(1));
     }
@@ -248,7 +248,7 @@ public void testSelectRandomStartedNodeWeighedOnAllocationsForNRequests_GivenMul
 
         final int selectionCount = 10000;
         final CountAccumulator countsPerNodeAccumulator = new CountAccumulator();
-        var nodes = assignment.selectRandomStartedNodesWeighedOnAllocationsForNRequests(selectionCount, RoutingState.STARTED);
+        var nodes = assignment.selectRandomNodesWeighedOnAllocationsForNRequestsAndState(selectionCount, RoutingState.STARTED);
 
         assertThat(nodes, hasSize(3));
         assertThat(nodes.stream().mapToInt(Tuple::v2).sum(), equalTo(selectionCount));
@@ -269,7 +269,7 @@ public void testselectRandomStartedNodeWeighedOnAllocationsForNRequests_GivenMul
         builder.addRoutingEntry("node-3", new RoutingInfo(0, 0, RoutingState.STARTED, ""));
         TrainedModelAssignment assignment = builder.build();
         final int selectionCount = 1000;
-        var nodeCounts = assignment.selectRandomStartedNodesWeighedOnAllocationsForNRequests(selectionCount, RoutingState.STARTED);
+        var nodeCounts = assignment.selectRandomNodesWeighedOnAllocationsForNRequestsAndState(selectionCount, RoutingState.STARTED);
         assertThat(nodeCounts, hasSize(3));
 
         var selectedNodes = new HashSet<String>();
 
@@ -0,0 +1,96 @@
+/*
+ * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
+ * or more contributor license agreements. Licensed under the Elastic License
+ * 2.0; you may not use this file except in compliance with the Elastic License
+ * 2.0.
+ */
+
+package org.elasticsearch.xpack.ml.integration;
+
+import org.elasticsearch.client.Request;
+import org.elasticsearch.client.Response;
+import org.elasticsearch.client.ResponseListener;
+import org.elasticsearch.common.xcontent.support.XContentMapValues;
+import org.elasticsearch.core.TimeValue;
+import org.elasticsearch.xpack.core.ml.inference.assignment.AdaptiveAllocationsSettings;
+import org.junit.Before;
+
+import java.io.IOException;
+import java.util.List;
+import java.util.Map;
+import java.util.concurrent.CountDownLatch;
+import java.util.concurrent.TimeUnit;
+
+import static org.hamcrest.Matchers.is;
+import static org.hamcrest.Matchers.not;
+import static org.hamcrest.Matchers.nullValue;
+
+/**
+ * REST integration test for a deployment started with
+ * {@code new AdaptiveAllocationsSettings(true, 0, 1)}: the reported allocation
+ * count is expected to drop to zero while the deployment is idle, and inference
+ * requests sent afterwards are expected to succeed.
+ */
+public class AdaptiveAllocationsScaleFromZeroIT extends PyTorchModelRestTestCase {
+
+    /**
+     * Sets the persistent cluster setting
+     * {@code xpack.ml.adaptive_allocations_scale_to_zero} to 2 seconds before
+     * each test, so the test does not have to wait out the setting's regular value.
+     */
+    @Before
+    public void setShortScaleToZeroPeriod() throws IOException {
+        logger.info("setting time");
+        Request scaleToZeroTime = new Request("PUT", "_cluster/settings");
+        scaleToZeroTime.setJsonEntity("""
+            {
+              "persistent": {
+                "xpack.ml.adaptive_allocations_scale_to_zero": "2s"
+              }
+            }""");
+
+        client().performRequest(scaleToZeroTime);
+    }
+
+    /**
+     * Starts a deployment, waits until its reported allocation count is zero,
+     * then sends a batch of asynchronous inference requests and requires every
+     * one of them to complete successfully.
+     */
+    @SuppressWarnings("unchecked")
+    public void testScaleFromZero() throws Exception {
+        String modelId = "test_scale_from_zero";
+        createPassThroughModel(modelId);
+        putModelDefinition(modelId, PyTorchModelIT.BASE_64_ENCODED_MODEL, PyTorchModelIT.RAW_MODEL_SIZE);
+        putVocabulary(List.of("Auto", "scale", "and", "infer"), modelId);
+
+        startDeployment(modelId, modelId, new AdaptiveAllocationsSettings(true, 0, 1));
+
+        var responseMap = entityAsMap(getTrainedModelStats(modelId));
+        List<Map<String, Object>> stats = (List<Map<String, Object>>) responseMap.get("trained_model_stats");
+        String statusState = (String) XContentMapValues.extractValue("deployment_stats.allocation_status.state", stats.get(0));
+        assertThat(responseMap.toString(), statusState, is(not(nullValue())));
+        Integer count = (Integer) XContentMapValues.extractValue("deployment_stats.allocation_status.allocation_count", stats.get(0));
+        assertThat(responseMap.toString(), count, is(1));
+
+        // wait for scale down. The scaler service will check every 10 seconds
+        assertBusy(() -> {
+            var statsMap = entityAsMap(getTrainedModelStats(modelId));
+            List<Map<String, Object>> innerStats = (List<Map<String, Object>>) statsMap.get("trained_model_stats");
+            Integer innerCount = (Integer) XContentMapValues.extractValue(
+                "deployment_stats.allocation_status.allocation_count",
+                innerStats.get(0)
+            );
+            assertThat(statsMap.toString(), innerCount, is(0));
+        }, 30, TimeUnit.SECONDS);
+
+        // infer will scale up
+        int inferenceCount = 10;
+        var latch = new CountDownLatch(inferenceCount);
+        // One slot per request, written only by that request's callback. The listener may run on a
+        // client thread, where a thrown AssertionError is not reliably reported as a failure of this
+        // test, so failures are recorded here and asserted on the test thread. Each write happens
+        // before the matching countDown(), which makes it visible once await() has returned.
+        var failures = new Exception[inferenceCount];
+        for (int i = 0; i < inferenceCount; i++) {
+            final int requestIndex = i;
+            asyncInfer("Auto scale and infer", modelId, TimeValue.timeValueSeconds(5), new ResponseListener() {
+                @Override
+                public void onSuccess(Response response) {
+                    latch.countDown();
+                }
+
+                @Override
+                public void onFailure(Exception exception) {
+                    failures[requestIndex] = exception;
+                    latch.countDown();
+                }
+            });
+        }
+
+        // Bounded wait so that a lost callback fails the test instead of hanging it.
+        assertThat("timed out waiting for inference responses", latch.await(1, TimeUnit.MINUTES), is(true));
+        for (int i = 0; i < inferenceCount; i++) {
+            assertThat("inference request [" + i + "] failed: " + failures[i], failures[i], is(nullValue()));
+        }
+    }
+
+    // TODO: add a test covering multiple deployments waiting at the same time (testMultipleDeploymentsWaiting)
+
+}
@@ -10,13 +10,20 @@
 import org.apache.http.util.EntityUtils;
 import org.elasticsearch.client.Request;
 import org.elasticsearch.client.Response;
+import org.elasticsearch.client.ResponseListener;
+import org.elasticsearch.common.Strings;
+import org.elasticsearch.common.collect.Iterators;
 import org.elasticsearch.common.settings.Settings;
 import org.elasticsearch.common.util.concurrent.ThreadContext;
+import org.elasticsearch.common.xcontent.ChunkedToXContentHelper;
+import org.elasticsearch.common.xcontent.ChunkedToXContentObject;
 import org.elasticsearch.common.xcontent.support.XContentMapValues;
-import org.elasticsearch.core.Strings;
 import org.elasticsearch.core.TimeValue;
 import org.elasticsearch.test.SecuritySettingsSourceField;
 import org.elasticsearch.test.rest.ESRestTestCase;
+import org.elasticsearch.xcontent.XContentBuilder;
+import org.elasticsearch.xcontent.json.JsonXContent;
+import org.elasticsearch.xpack.core.ml.inference.assignment.AdaptiveAllocationsSettings;
 import org.elasticsearch.xpack.core.ml.inference.assignment.AllocationStatus;
 import org.elasticsearch.xpack.core.ml.inference.assignment.Priority;
 import org.elasticsearch.xpack.core.ml.integration.MlRestTestStateCleaner;
@@ -282,6 +289,32 @@ protected Response startDeployment(
         return client().performRequest(request);
     }
 
+    /**
+     * Starts a deployment of {@code modelId} under {@code deploymentId} with one thread per
+     * allocation, sending {@code adaptiveAllocationsSettings} as the {@code adaptive_allocations}
+     * field of the JSON request body.
+     *
+     * @param modelId                     the model to deploy; inserted into the request path as is
+     * @param deploymentId                value of the {@code deployment_id} request parameter
+     * @param adaptiveAllocationsSettings settings serialized under {@code adaptive_allocations}
+     * @return the response to the start deployment request
+     * @throws IOException if building the request body or performing the request fails
+     */
+    protected Response startDeployment(String modelId, String deploymentId, AdaptiveAllocationsSettings adaptiveAllocationsSettings)
+        throws IOException {
+        String endPoint = "/_ml/trained_models/"
+            + modelId
+            + "/deployment/_start"
+            + "?deployment_id="
+            + deploymentId
+            + "&threads_per_allocation=1";
+
+        XContentBuilder builder = JsonXContent.contentBuilder();
+        builder.startObject();
+        builder.field("adaptive_allocations", adaptiveAllocationsSettings);
+        builder.endObject();
+        var body = Strings.toString(builder);
+
+        Request request = new Request("POST", endPoint);
+        request.setJsonEntity(body);
+        return client().performRequest(request);
+    }
+
     protected void stopDeployment(String modelId) throws IOException {
         stopDeployment(modelId, false, false);
     }
@@ -325,6 +358,14 @@ protected Response infer(String input, String modelId, TimeValue timeout) throws
         return client().performRequest(request);
     }
 
+    /**
+     * Sends an {@code _infer} request for {@code modelId} without waiting for the result; the
+     * outcome is handed to {@code responseListener}.
+     *
+     * @param input            text placed verbatim into the single document of the request body,
+     *                         so it must not contain characters that need JSON escaping
+     * @param modelId          the model to infer against; inserted into the request path as is
+     * @param timeout          value of the {@code timeout} request parameter
+     * @param responseListener receives the response or the failure of the request
+     */
+    protected void asyncInfer(String input, String modelId, TimeValue timeout, ResponseListener responseListener) throws IOException {
+        // getStringRep() always renders a whole number plus unit. toString() may render fractional
+        // values (e.g. "1.5m") which cannot be parsed back into a time value.
+        Request request = new Request("POST", "/_ml/trained_models/" + modelId + "/_infer?timeout=" + timeout.getStringRep());
+        request.setJsonEntity(Strings.format("""
+            {  "docs": [{"input":"%s"}] }
+            """, input));
+        client().performRequestAsync(request, responseListener);
+    }
+
     protected Response infer(String input, String modelId) throws IOException {
         Request request = new Request("POST", "/_ml/trained_models/" + modelId + "/_infer?timeout=30s");
         request.setJsonEntity(Strings.format("""
 
@@ -758,6 +758,18 @@ public void loadExtensions(ExtensionLoader loader) {
      */
     public static final int MAX_LOW_PRIORITY_MODELS_PER_NODE = 100;
 
+    /**
+     * How long a period without any requests must last before scaling down
+     * to zero allocations. Defaults to 15 minutes, may not be set below
+     * 1 second, and can be updated dynamically.
+     */
+    public static final Setting<TimeValue> ADAPTIVE_ALLOCATIONS_SCALE_TO_ZERO_TIME = Setting.timeSetting(
+        "xpack.ml.adaptive_allocations_scale_to_zero",
+        TimeValue.timeValueMinutes(15),
+        TimeValue.timeValueSeconds(1),
+        Property.Dynamic,
+        Property.NodeScope
+    );
+
     private static final Logger logger = LogManager.getLogger(MachineLearning.class);
     private static final DeprecationLogger deprecationLogger = DeprecationLogger.getLogger(MachineLearning.class);
 
@@ -817,7 +829,8 @@ public List<Setting<?>> getSettings() {
             MAX_ML_NODE_SIZE,
             DELAYED_DATA_CHECK_FREQ,
             DUMMY_ENTITY_MEMORY,
-            DUMMY_ENTITY_PROCESSORS
+            DUMMY_ENTITY_PROCESSORS,
+            ADAPTIVE_ALLOCATIONS_SCALE_TO_ZERO_TIME
         );
     }
 
 
@@ -11,6 +11,7 @@
 import org.elasticsearch.cluster.service.ClusterService;
 import org.elasticsearch.injection.guice.Inject;
 import org.elasticsearch.license.XPackLicenseState;
+import org.elasticsearch.threadpool.ThreadPool;
 import org.elasticsearch.transport.TransportService;
 import org.elasticsearch.xpack.core.ml.action.InferModelAction;
 import org.elasticsearch.xpack.ml.inference.adaptiveallocations.AdaptiveAllocationsScalerService;
@@ -29,7 +30,8 @@ public TransportExternalInferModelAction(
         XPackLicenseState licenseState,
         TrainedModelProvider trainedModelProvider,
         AdaptiveAllocationsScalerService adaptiveAllocationsScalerService,
-        TrainedModelAssignmentService assignmentService
+        TrainedModelAssignmentService assignmentService,
+        ThreadPool threadPool
     ) {
         super(
             InferModelAction.EXTERNAL_NAME,
@@ -41,7 +43,8 @@ public TransportExternalInferModelAction(
             licenseState,
             trainedModelProvider,
             adaptiveAllocationsScalerService,
-            assignmentService
+            assignmentService,
+            threadPool
         );
     }
 }