Skip to content

Commit 7a56691

Browse files
committed
protect against multiple requests
1 parent 3413592 commit 7a56691

File tree

7 files changed

+220
-51
lines changed

7 files changed

+220
-51
lines changed

x-pack/plugin/core/src/main/java/org/elasticsearch/xpack/core/ml/inference/assignment/TrainedModelAssignment.java

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -226,7 +226,7 @@ public boolean hasStartedRoutes() {
226226

227227
public List<Tuple<String, Integer>> selectRandomNodesWeighedOnAllocationsForNRequestsAndState(
228228
int numberOfRequests,
229-
RoutingState ... acceptableStates
229+
RoutingState... acceptableStates
230230
) {
231231
List<String> nodeIds = new ArrayList<>(nodeRoutingTable.size());
232232
List<Integer> cumulativeAllocations = new ArrayList<>(nodeRoutingTable.size());

x-pack/plugin/ml/qa/native-multi-node-tests/src/javaRestTest/java/org/elasticsearch/xpack/ml/integration/AdaptiveAllocationsScaleFromZeroIT.java

Lines changed: 50 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -16,11 +16,13 @@
1616
import org.junit.Before;
1717

1818
import java.io.IOException;
19+
import java.util.Arrays;
1920
import java.util.List;
2021
import java.util.Map;
2122
import java.util.concurrent.CountDownLatch;
2223
import java.util.concurrent.TimeUnit;
2324

25+
import static org.hamcrest.Matchers.hasSize;
2426
import static org.hamcrest.Matchers.is;
2527
import static org.hamcrest.Matchers.not;
2628
import static org.hamcrest.Matchers.nullValue;
@@ -69,7 +71,7 @@ public void testScaleFromZero() throws Exception {
6971
}, 30, TimeUnit.SECONDS);
7072

7173
// infer will scale up
72-
int inferenceCount = 10;
74+
int inferenceCount = 100;
7375
var latch = new CountDownLatch(inferenceCount);
7476
for (int i = 0; i < inferenceCount; i++) {
7577
asyncInfer("Auto scale and infer", modelId, TimeValue.timeValueSeconds(5), new ResponseListener() {
@@ -89,8 +91,52 @@ public void onFailure(Exception exception) {
8991
latch.await();
9092
}
9193

92-
// public void testMultipleDeploymentsWaiting() {
93-
//
94-
// }
94+
@SuppressWarnings("unchecked")
95+
public void testMultipleDeploymentsWaiting() throws Exception {
96+
String id1 = "test_scale_from_zero_dep_1";
97+
String id2 = "test_scale_from_zero_dep_2";
98+
String id3 = "test_scale_from_zero_dep_3";
99+
var idsList = Arrays.asList(id1, id2, id3);
100+
for (var modelId : idsList) {
101+
createPassThroughModel(modelId);
102+
putModelDefinition(modelId, PyTorchModelIT.BASE_64_ENCODED_MODEL, PyTorchModelIT.RAW_MODEL_SIZE);
103+
putVocabulary(List.of("Auto", "scale", "and", "infer"), modelId);
104+
105+
startDeployment(modelId, modelId, new AdaptiveAllocationsSettings(true, 0, 1));
106+
}
107+
108+
// wait for scale down. The scaler service will check every 10 seconds
109+
assertBusy(() -> {
110+
var statsMap = entityAsMap(getTrainedModelStats("test_scale_from_zero_dep_*"));
111+
List<Map<String, Object>> innerStats = (List<Map<String, Object>>) statsMap.get("trained_model_stats");
112+
assertThat(innerStats, hasSize(3));
113+
for (int i = 0; i < 3; i++) {
114+
Integer innerCount = (Integer) XContentMapValues.extractValue(
115+
"deployment_stats.allocation_status.allocation_count",
116+
innerStats.get(i)
117+
);
118+
assertThat(statsMap.toString(), innerCount, is(0));
119+
}
120+
}, 30, TimeUnit.SECONDS);
95121

122+
// infer will scale up
123+
int inferenceCount = 100;
124+
var latch = new CountDownLatch(inferenceCount);
125+
for (int i = 0; i < inferenceCount; i++) {
126+
asyncInfer("Auto scale and infer", randomFrom(idsList), TimeValue.timeValueSeconds(5), new ResponseListener() {
127+
@Override
128+
public void onSuccess(Response response) {
129+
latch.countDown();
130+
}
131+
132+
@Override
133+
public void onFailure(Exception exception) {
134+
latch.countDown();
135+
fail(exception.getMessage());
136+
}
137+
});
138+
}
139+
140+
latch.await();
141+
}
96142
}

x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/action/TransportInternalInferModelAction.java

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -268,7 +268,10 @@ private void inferAgainstAllocatedModel(
268268

269269
// We couldn't find any nodes in the started state so let's look for ones that are stopping in case we're shutting down some nodes
270270
if (nodes.isEmpty()) {
271-
nodes = assignment.selectRandomNodesWeighedOnAllocationsForNRequestsAndState(request.numberOfDocuments(), RoutingState.STOPPING);
271+
nodes = assignment.selectRandomNodesWeighedOnAllocationsForNRequestsAndState(
272+
request.numberOfDocuments(),
273+
RoutingState.STOPPING
274+
);
272275
}
273276

274277
if (nodes.isEmpty()) {

x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/inference/adaptiveallocations/AdaptiveAllocationsScaler.java

Lines changed: 0 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -9,11 +9,7 @@
99

1010
import org.apache.logging.log4j.LogManager;
1111
import org.apache.logging.log4j.Logger;
12-
import org.elasticsearch.cluster.service.ClusterService;
1312
import org.elasticsearch.common.Strings;
14-
import org.elasticsearch.core.TimeValue;
15-
16-
import static org.elasticsearch.xpack.ml.MachineLearning.ADAPTIVE_ALLOCATIONS_SCALE_TO_ZERO_TIME;
1713

1814
/**
1915
* Processes measured requests counts and inference times and decides whether

x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/inference/adaptiveallocations/AdaptiveAllocationsScalerService.java

Lines changed: 66 additions & 36 deletions
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,7 @@
2525
import org.elasticsearch.threadpool.Scheduler;
2626
import org.elasticsearch.threadpool.ThreadPool;
2727
import org.elasticsearch.xpack.core.ClientHelper;
28+
import org.elasticsearch.xpack.core.ml.action.CreateTrainedModelAssignmentAction;
2829
import org.elasticsearch.xpack.core.ml.action.GetDeploymentStatsAction;
2930
import org.elasticsearch.xpack.core.ml.action.UpdateTrainedModelDeploymentAction;
3031
import org.elasticsearch.xpack.core.ml.inference.assignment.AssignmentStats;
@@ -40,6 +41,7 @@
4041
import java.util.List;
4142
import java.util.Map;
4243
import java.util.Set;
44+
import java.util.concurrent.ConcurrentSkipListSet;
4345
import java.util.concurrent.atomic.AtomicBoolean;
4446
import java.util.concurrent.atomic.AtomicLong;
4547
import java.util.function.Function;
@@ -206,6 +208,8 @@ Collection<DoubleWithAttributes> observeDouble(Function<AdaptiveAllocationsScale
206208

207209
private final AtomicLong scaleToZeroAfterNoRequestsSeconds = new AtomicLong();
208210

211+
private final Set<String> inFlightScaleFromZeroRequests = new ConcurrentSkipListSet<>();
212+
209213
public AdaptiveAllocationsScalerService(
210214
ThreadPool threadPool,
211215
ClusterService clusterService,
@@ -287,8 +291,11 @@ private synchronized void updateAutoscalers(ClusterState state) {
287291
&& assignment.getAdaptiveAllocationsSettings().getEnabled() == Boolean.TRUE) {
288292
AdaptiveAllocationsScaler adaptiveAllocationsScaler = scalers.computeIfAbsent(
289293
assignment.getDeploymentId(),
290-
key -> new AdaptiveAllocationsScaler(assignment.getDeploymentId(), assignment.totalTargetAllocations(),
291-
scaleToZeroAfterNoRequestsSeconds.get())
294+
key -> new AdaptiveAllocationsScaler(
295+
assignment.getDeploymentId(),
296+
assignment.totalTargetAllocations(),
297+
scaleToZeroAfterNoRequestsSeconds.get()
298+
)
292299
);
293300
adaptiveAllocationsScaler.setMinMaxNumberOfAllocations(
294301
assignment.getAdaptiveAllocationsSettings().getMinNumberOfAllocations(),
@@ -416,22 +423,42 @@ private void processDeploymentStats(GetDeploymentStatsAction.Response statsRespo
416423
if (newNumberOfAllocations > numberOfAllocations.get(deploymentId)) {
417424
lastScaleUpTimesMillis.put(deploymentId, now);
418425
}
419-
updateNumberOfAllocations(deploymentId, newNumberOfAllocations);
426+
updateNumberOfAllocations(
427+
deploymentId,
428+
newNumberOfAllocations,
429+
updateAssigmentListener(deploymentId, newNumberOfAllocations)
430+
);
420431
}
421432
}
422433
}
423434

424435
public boolean maybeStartAllocation(TrainedModelAssignment assignment) {
425436
if (assignment.getAdaptiveAllocationsSettings() != null
426-
&& assignment.getAdaptiveAllocationsSettings().getEnabled() == Boolean.TRUE) {
427-
lastScaleUpTimesMillis.put(assignment.getDeploymentId(), System.currentTimeMillis());
428-
updateNumberOfAllocations(assignment.getDeploymentId(), 1);
437+
&& assignment.getAdaptiveAllocationsSettings().getEnabled() == Boolean.TRUE
438+
&& assignment.getAdaptiveAllocationsSettings().getMinNumberOfAllocations() == 0) {
439+
440+
// Protect against a flurry of scale-up requests.
441+
if (inFlightScaleFromZeroRequests.contains(assignment.getDeploymentId()) == false) {
442+
lastScaleUpTimesMillis.put(assignment.getDeploymentId(), System.currentTimeMillis());
443+
var updateListener = updateAssigmentListener(assignment.getDeploymentId(), 1);
444+
var cleanUpListener = ActionListener.runAfter(
445+
updateListener,
446+
() -> inFlightScaleFromZeroRequests.remove(assignment.getDeploymentId())
447+
);
448+
449+
inFlightScaleFromZeroRequests.add(assignment.getDeploymentId());
450+
updateNumberOfAllocations(assignment.getDeploymentId(), 1, cleanUpListener);
451+
}
429452
return true;
430453
}
431454
return false;
432455
}
433456

434-
private void updateNumberOfAllocations(String deploymentId, int numberOfAllocations) {
457+
private void updateNumberOfAllocations(
458+
String deploymentId,
459+
int numberOfAllocations,
460+
ActionListener<CreateTrainedModelAssignmentAction.Response> listener
461+
) {
435462
UpdateTrainedModelDeploymentAction.Request updateRequest = new UpdateTrainedModelDeploymentAction.Request(deploymentId);
436463
updateRequest.setNumberOfAllocations(numberOfAllocations);
437464
updateRequest.setIsInternal(true);
@@ -440,40 +467,43 @@ private void updateNumberOfAllocations(String deploymentId, int numberOfAllocati
440467
ClientHelper.ML_ORIGIN,
441468
UpdateTrainedModelDeploymentAction.INSTANCE,
442469
updateRequest,
443-
ActionListener.wrap(updateResponse -> {
444-
logger.info("adaptive allocations scaler: scaled [{}] to [{}] allocations.", deploymentId, numberOfAllocations);
445-
threadPool.executor(MachineLearning.UTILITY_THREAD_POOL_NAME)
446-
.execute(
447-
() -> inferenceAuditor.info(
448-
deploymentId,
449-
Strings.format(
450-
"adaptive allocations scaler: scaled [%s] to [%s] allocations.",
451-
deploymentId,
452-
numberOfAllocations
453-
)
454-
)
455-
);
456-
}, e -> {
457-
logger.atLevel(Level.WARN)
458-
.withThrowable(e)
459-
.log("adaptive allocations scaler: scaling [{}] to [{}] allocations failed.", deploymentId, numberOfAllocations);
460-
threadPool.executor(MachineLearning.UTILITY_THREAD_POOL_NAME)
461-
.execute(
462-
() -> inferenceAuditor.warning(
463-
deploymentId,
464-
Strings.format(
465-
"adaptive allocations scaler: scaling [%s] to [%s] allocations failed.",
466-
deploymentId,
467-
numberOfAllocations
468-
)
469-
)
470-
);
471-
})
470+
listener
472471
);
473472
}
474473

475474
private void setScaleToZeroPeriod(TimeValue timeValue) {
476475
logger.info("setting scaler service to zero " + timeValue);
477476
scaleToZeroAfterNoRequestsSeconds.set(timeValue.seconds());
478477
}
478+
479+
private ActionListener<CreateTrainedModelAssignmentAction.Response> updateAssigmentListener(
480+
String deploymentId,
481+
int numberOfAllocations
482+
) {
483+
return ActionListener.wrap(updateResponse -> {
484+
logger.info("adaptive allocations scaler: scaled [{}] to [{}] allocations.", deploymentId, numberOfAllocations);
485+
threadPool.executor(MachineLearning.UTILITY_THREAD_POOL_NAME)
486+
.execute(
487+
() -> inferenceAuditor.info(
488+
deploymentId,
489+
Strings.format("adaptive allocations scaler: scaled [%s] to [%s] allocations.", deploymentId, numberOfAllocations)
490+
)
491+
);
492+
}, e -> {
493+
logger.atLevel(Level.WARN)
494+
.withThrowable(e)
495+
.log("adaptive allocations scaler: scaling [{}] to [{}] allocations failed.", deploymentId, numberOfAllocations);
496+
threadPool.executor(MachineLearning.UTILITY_THREAD_POOL_NAME)
497+
.execute(
498+
() -> inferenceAuditor.warning(
499+
deploymentId,
500+
Strings.format(
501+
"adaptive allocations scaler: scaling [%s] to [%s] allocations failed.",
502+
deploymentId,
503+
numberOfAllocations
504+
)
505+
)
506+
);
507+
});
508+
}
479509
}

x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/inference/waitforallocations/ScalingInference.java

Lines changed: 2 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,6 @@
1111
import org.apache.logging.log4j.Logger;
1212
import org.elasticsearch.action.ActionListener;
1313
import org.elasticsearch.cluster.ClusterState;
14-
import org.elasticsearch.common.Strings;
1514
import org.elasticsearch.tasks.TaskId;
1615
import org.elasticsearch.xpack.core.ml.action.InferModelAction;
1716
import org.elasticsearch.xpack.core.ml.inference.assignment.AllocationStatus;
@@ -58,11 +57,10 @@ public ScalingInference(
5857
}
5958

6059
public synchronized void waitForAssignment(WaitingRequest request) {
61-
logger.info("new wait for request");
6260
var p = queueRequests.computeIfAbsent(request.deploymentId(), k -> new LinkedBlockingQueue<>());
6361

6462
if (p.isEmpty()) {
65-
logger.info("will wait for condition");
63+
logger.info("waitForAssignment will wait for condition");
6664
p.offer(request);
6765
assignmentService.waitForAssignmentCondition(
6866
request.deploymentId(),
@@ -71,7 +69,7 @@ public synchronized void waitForAssignment(WaitingRequest request) {
7169
new WaitingListener(request.deploymentId())
7270
);
7371
} else {
74-
logger.info("added to queue");
72+
logger.info("waitForAssignment added to queue");
7573
p.offer(request);
7674
}
7775

0 commit comments

Comments
 (0)