+/*
+ * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
+ * or more contributor license agreements. Licensed under the Elastic License
+ * 2.0; you may not use this file except in compliance with the Elastic License
+ * 2.0.
+ */
+
+package org.elasticsearch.xpack.ml.inference;
+
+import org.apache.logging.log4j.LogManager;
+import org.apache.logging.log4j.Logger;
+import org.elasticsearch.ElasticsearchStatusException;
+import org.elasticsearch.action.ActionListener;
+import org.elasticsearch.cluster.ClusterState;
+import org.elasticsearch.rest.RestStatus;
+import org.elasticsearch.tasks.TaskId;
+import org.elasticsearch.xpack.core.ml.action.InferModelAction;
+import org.elasticsearch.xpack.core.ml.inference.assignment.RoutingInfo;
+import org.elasticsearch.xpack.core.ml.inference.assignment.RoutingState;
+import org.elasticsearch.xpack.core.ml.inference.assignment.TrainedModelAssignment;
+import org.elasticsearch.xpack.core.ml.inference.assignment.TrainedModelAssignmentMetadata;
+import org.elasticsearch.xpack.core.ml.utils.ExceptionsHelper;
+import org.elasticsearch.xpack.ml.inference.assignment.TrainedModelAssignmentService;
+
+import java.util.HashMap;
+import java.util.Map;
+import java.util.concurrent.atomic.AtomicInteger;
+import java.util.concurrent.atomic.AtomicReference;
+import java.util.function.BiConsumer;
+import java.util.function.Predicate;
+
+import static org.elasticsearch.core.Strings.format;
+
+/**
+ * Class for storing inference requests for ML trained models while
+ * scaling is in progress. Once the trained model has at least one
+ * allocation the stored requests are forwarded to a consumer for
+ * processing. Requests time out while waiting for scale.
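+ * <p>
+ * A minimal usage sketch; the {@code handleScaledInference} handler and the
+ * surrounding variables are illustrative, not part of this class:
+ * <pre>{@code
+ * InferenceWaitForAllocation waiter = new InferenceWaitForAllocation(
+ *     assignmentService,
+ *     (waitingRequest, assignment) -> handleScaledInference(waitingRequest, assignment)
+ * );
+ * waiter.waitForAssignment(
+ *     new InferenceWaitForAllocation.WaitingRequest(inferRequest, responseBuilder, parentTaskId, listener)
+ * );
+ * }</pre>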
+ */
+public class InferenceWaitForAllocation {
+
+    public static final int MAX_PENDING_REQUEST_COUNT = 100;
+
+    /**
+     * Track details of the pending request.
+     */
+    public record WaitingRequest(
+        InferModelAction.Request request,
+        InferModelAction.Response.Builder responseBuilder,
+        TaskId parentTaskId,
+        ActionListener<InferModelAction.Response> listener
+    ) {
+        public String deploymentId() {
+            return request.getId();
+        }
+    }
+
+    private static final Logger logger = LogManager.getLogger(InferenceWaitForAllocation.class);
+
+    private final TrainedModelAssignmentService assignmentService;
+    private final BiConsumer<WaitingRequest, TrainedModelAssignment> queuedConsumer;
+    private final AtomicInteger pendingRequestCount = new AtomicInteger();
+
+    /**
+     * Create with a consumer for requests whose deployments have scaled.
+     * @param assignmentService Trained model assignment service
+     * @param onInferenceScaledConsumer The consumer of the waiting request, called once an
+     *                                  allocation is available.
+     */
+    public InferenceWaitForAllocation(
+        TrainedModelAssignmentService assignmentService,
+        BiConsumer<WaitingRequest, TrainedModelAssignment> onInferenceScaledConsumer
+    ) {
+        this.assignmentService = assignmentService;
+        this.queuedConsumer = onInferenceScaledConsumer;
+    }
+
+    /**
+     * Wait for at least one allocation to be started, then process the
+     * inference request.
+     * If the pending request count is greater than {@link #MAX_PENDING_REQUEST_COUNT}
+     * the request listener is failed with a too-many-requests exception.
+     * The timeout is the inference request timeout.
+     * @param request The inference request details
+     */
+    public synchronized void waitForAssignment(WaitingRequest request) {
+        logger.debug("[{}] waiting for allocation", request.deploymentId());
+        if (pendingRequestCount.get() > MAX_PENDING_REQUEST_COUNT) {
+            request.listener()
+                .onFailure(
+                    new ElasticsearchStatusException(
+                        "Rejected inference request waiting for an allocation of deployment [{}]. Too many pending requests",
+                        RestStatus.TOO_MANY_REQUESTS,
+                        request.deploymentId()
+                    )
+                );
+            return;
+        }
+
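+        // Count this request against the cap, then wait for the cluster state
+        // to show at least one allocation for the deployment.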
+        pendingRequestCount.incrementAndGet();
+        var predicate = new DeploymentHasAtLeastOneAllocation(request.deploymentId());
+
+        assignmentService.waitForAssignmentCondition(
+            request.deploymentId(),
+            predicate,
+            request.request().getInferenceTimeout(),
+            new WaitingListener(request.deploymentId(), request, predicate)
+        );
+    }
+
+    private static class DeploymentHasAtLeastOneAllocation implements Predicate<ClusterState> {
+
+        private final String deploymentId;
+        private final AtomicReference<Exception> exception = new AtomicReference<>();
+
+        DeploymentHasAtLeastOneAllocation(String deploymentId) {
+            this.deploymentId = ExceptionsHelper.requireNonNull(deploymentId, "deployment_id");
+        }
+
+        @Override
+        public boolean test(ClusterState clusterState) {
+            logger.debug(() -> format("[%s] testing if deployment has at least one allocation", deploymentId));
+            TrainedModelAssignment trainedModelAssignment = TrainedModelAssignmentMetadata.assignmentForDeploymentId(
+                clusterState,
+                deploymentId
+            ).orElse(null);
+            if (trainedModelAssignment == null) {
+                logger.info(() -> format("[%s] assignment was null while waiting to scale up", deploymentId));
+                return false;
+            }
+
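+            // Collect the nodes whose routing state is FAILED, keyed by node id,
+            // along with their failure reasons.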
+            Map<String, String> nodeFailuresAndReasons = new HashMap<>();
+            for (var nodeIdAndRouting : trainedModelAssignment.getNodeRoutingTable().entrySet()) {
+                if (RoutingState.FAILED.equals(nodeIdAndRouting.getValue().getState())) {
+                    nodeFailuresAndReasons.put(nodeIdAndRouting.getKey(), nodeIdAndRouting.getValue().getReason());
+                }
+            }
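+            // If every node has failed there is nothing to wait for: latch the error
+            // and stop testing. If only some nodes have failed, log and keep waiting.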
+            if (nodeFailuresAndReasons.isEmpty() == false) {
+                if (nodeFailuresAndReasons.size() == trainedModelAssignment.getNodeRoutingTable().size()) {
+                    exception.set(
+                        new ElasticsearchStatusException(
+                            "[{}] Error waiting for a model allocation, all nodes have failed with errors [{}]",
+                            RestStatus.INTERNAL_SERVER_ERROR,
+                            trainedModelAssignment.getDeploymentId(),
+                            nodeFailuresAndReasons
+                        )
+                    );
+                    return true; // don't try again
+                } else {
+                    logger.warn(
+                        "Deployment [{}] has failed routes [{}]",
+                        trainedModelAssignment.getDeploymentId(),
+                        nodeFailuresAndReasons
+                    );
+                }
+            }
+
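+            // The wait condition is satisfied once at least one route can serve traffic.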
+            var routable = trainedModelAssignment.getNodeRoutingTable().values().stream().filter(RoutingInfo::isRoutable).findFirst();
+            if (routable.isPresent()) {
+                logger.debug(
+                    () -> format(
+                        "[%s] first route %s, state %s",
+                        deploymentId,
+                        routable.get(),
+                        trainedModelAssignment.calculateAllocationStatus()
+                    )
+                );
+            } else {
+                logger.debug(() -> format("[%s] no routes", deploymentId));
+            }
+
+            return routable.isPresent();
+        }
+    }
+
+    private class WaitingListener implements TrainedModelAssignmentService.WaitForAssignmentListener {
+
+        private final String deploymentId;
+        private final WaitingRequest request;
+        private final DeploymentHasAtLeastOneAllocation predicate;
+
+        private WaitingListener(String deploymentId, WaitingRequest request, DeploymentHasAtLeastOneAllocation predicate) {
+            this.deploymentId = deploymentId;
+            this.request = request;
+            this.predicate = predicate;
+        }
+
+        @Override
+        public void onResponse(TrainedModelAssignment assignment) {
+            // assignment is started, do inference
+            pendingRequestCount.decrementAndGet();
+
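+            // The predicate may have latched an all-nodes-failed error; if so,
+            // fail the request instead of forwarding it for inference.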
+            if (predicate.exception.get() != null) {
+                onFailure(predicate.exception.get());
+                return;
+            }
+
+            logger.debug("[{}] sending request that waited for allocation", deploymentId);
+            queuedConsumer.accept(request, assignment);
+        }
+
+        @Override
+        public void onFailure(Exception e) {
+            logger.debug("[{}] failed waiting for allocation", deploymentId, e);
+            pendingRequestCount.decrementAndGet();
+            request.listener().onFailure(e);
+        }
+    }
+}