Skip to content

Commit 1aea049

Browse files
prwhelan, elasticmachine, and davidkyle
authored
[ML] Avoid ModelAssignment deadlock (#109684)
The model loading scheduled thread iterates through the model queue and deploys each model. Rather than block and wait on each deployment, the thread will attach a listener that will either iterate to the next model (if one is in the queue) or reschedule the thread. This change should not impact: 1. the iterative nature of the model deployment process - each model is still deployed one at a time, and no additional threads are consumed per model. 2. the 1s delay between model deployment tries - if a deployment fails but can be retried, the retry is added to the next batch of models that are consumed after the 1s scheduled delay. Co-authored-by: Elastic Machine <[email protected]> Co-authored-by: David Kyle <[email protected]>
1 parent 5d53c9a commit 1aea049

File tree

3 files changed

+150
-89
lines changed

3 files changed

+150
-89
lines changed

docs/changelog/109684.yaml

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,5 @@
1+
pr: 109684
2+
summary: Avoid `ModelAssignment` deadlock
3+
area: Machine Learning
4+
type: bug
5+
issues: []

x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/inference/assignment/TrainedModelAssignmentNodeService.java

Lines changed: 71 additions & 50 deletions
Original file line numberDiff line numberDiff line change
@@ -12,8 +12,7 @@
1212
import org.elasticsearch.ResourceNotFoundException;
1313
import org.elasticsearch.action.ActionListener;
1414
import org.elasticsearch.action.search.SearchPhaseExecutionException;
15-
import org.elasticsearch.action.support.PlainActionFuture;
16-
import org.elasticsearch.action.support.UnsafePlainActionFuture;
15+
import org.elasticsearch.action.support.SubscribableListener;
1716
import org.elasticsearch.action.support.master.AcknowledgedResponse;
1817
import org.elasticsearch.cluster.ClusterChangedEvent;
1918
import org.elasticsearch.cluster.ClusterState;
@@ -53,7 +52,6 @@
5352
import org.elasticsearch.xpack.ml.inference.deployment.TrainedModelDeploymentTask;
5453
import org.elasticsearch.xpack.ml.task.AbstractJobPersistentTasksExecutor;
5554

56-
import java.util.ArrayDeque;
5755
import java.util.ArrayList;
5856
import java.util.Collections;
5957
import java.util.Deque;
@@ -154,26 +152,38 @@ public void beforeStop() {
154152
this.expressionResolver = expressionResolver;
155153
}
156154

157-
public void start() {
155+
void start() {
158156
stopped = false;
159-
scheduledFuture = threadPool.scheduleWithFixedDelay(
160-
this::loadQueuedModels,
161-
MODEL_LOADING_CHECK_INTERVAL,
162-
threadPool.executor(MachineLearning.UTILITY_THREAD_POOL_NAME)
163-
);
157+
schedule(false);
164158
}
165159

166-
public void stop() {
160+
private void schedule(boolean runImmediately) {
161+
if (stopped) {
162+
// do not schedule when stopped
163+
return;
164+
}
165+
166+
var rescheduleListener = ActionListener.wrap(this::schedule, e -> this.schedule(false));
167+
Runnable loadQueuedModels = () -> loadQueuedModels(rescheduleListener);
168+
var executor = threadPool.executor(MachineLearning.UTILITY_THREAD_POOL_NAME);
169+
170+
if (runImmediately) {
171+
executor.execute(loadQueuedModels);
172+
} else {
173+
scheduledFuture = threadPool.schedule(loadQueuedModels, MODEL_LOADING_CHECK_INTERVAL, executor);
174+
}
175+
}
176+
177+
void stop() {
167178
stopped = true;
168179
ThreadPool.Cancellable cancellable = this.scheduledFuture;
169180
if (cancellable != null) {
170181
cancellable.cancel();
171182
}
172183
}
173184

174-
void loadQueuedModels() {
175-
TrainedModelDeploymentTask loadingTask;
176-
if (loadingModels.isEmpty()) {
185+
void loadQueuedModels(ActionListener<Boolean> rescheduleImmediately) {
186+
if (stopped) {
177187
return;
178188
}
179189
if (latestState != null) {
@@ -188,39 +198,49 @@ void loadQueuedModels() {
188198
);
189199
if (unassignedIndices.size() > 0) {
190200
logger.trace("not loading models as indices {} primary shards are unassigned", unassignedIndices);
201+
rescheduleImmediately.onResponse(false);
191202
return;
192203
}
193204
}
194-
logger.trace("attempting to load all currently queued models");
195-
// NOTE: As soon as this method exits, the timer for the scheduler starts ticking
196-
Deque<TrainedModelDeploymentTask> loadingToRetry = new ArrayDeque<>();
197-
while ((loadingTask = loadingModels.poll()) != null) {
198-
final String deploymentId = loadingTask.getDeploymentId();
199-
if (loadingTask.isStopped()) {
200-
if (logger.isTraceEnabled()) {
201-
String reason = loadingTask.stoppedReason().orElse("_unknown_");
202-
logger.trace("[{}] attempted to load stopped task with reason [{}]", deploymentId, reason);
203-
}
204-
continue;
205+
206+
var loadingTask = loadingModels.poll();
207+
if (loadingTask == null) {
208+
rescheduleImmediately.onResponse(false);
209+
return;
210+
}
211+
212+
loadModel(loadingTask, ActionListener.wrap(retry -> {
213+
if (retry != null && retry) {
214+
loadingModels.offer(loadingTask);
215+
// don't reschedule immediately if the next task is the one we just queued, instead wait a bit to retry
216+
rescheduleImmediately.onResponse(loadingModels.peek() != loadingTask);
217+
} else {
218+
rescheduleImmediately.onResponse(loadingModels.isEmpty() == false);
205219
}
206-
if (stopped) {
207-
return;
220+
}, e -> rescheduleImmediately.onResponse(loadingModels.isEmpty() == false)));
221+
}
222+
223+
void loadModel(TrainedModelDeploymentTask loadingTask, ActionListener<Boolean> retryListener) {
224+
if (loadingTask.isStopped()) {
225+
if (logger.isTraceEnabled()) {
226+
logger.trace(
227+
"[{}] attempted to load stopped task with reason [{}]",
228+
loadingTask.getDeploymentId(),
229+
loadingTask.stoppedReason().orElse("_unknown_")
230+
);
208231
}
209-
final PlainActionFuture<TrainedModelDeploymentTask> listener = new UnsafePlainActionFuture<>(
210-
MachineLearning.UTILITY_THREAD_POOL_NAME
211-
);
212-
try {
213-
deploymentManager.startDeployment(loadingTask, listener);
214-
// This needs to be synchronous here in the utility thread to keep queueing order
215-
TrainedModelDeploymentTask deployedTask = listener.actionGet();
216-
// kicks off asynchronous cluster state update
217-
handleLoadSuccess(deployedTask);
218-
} catch (Exception ex) {
232+
retryListener.onResponse(false);
233+
return;
234+
}
235+
SubscribableListener.<TrainedModelDeploymentTask>newForked(l -> deploymentManager.startDeployment(loadingTask, l))
236+
.andThen(threadPool.executor(MachineLearning.UTILITY_THREAD_POOL_NAME), threadPool.getThreadContext(), this::handleLoadSuccess)
237+
.addListener(retryListener.delegateResponse((retryL, ex) -> {
238+
var deploymentId = loadingTask.getDeploymentId();
219239
logger.warn(() -> "[" + deploymentId + "] Start deployment failed", ex);
220240
if (ExceptionsHelper.unwrapCause(ex) instanceof ResourceNotFoundException) {
221-
String modelId = loadingTask.getParams().getModelId();
241+
var modelId = loadingTask.getParams().getModelId();
222242
logger.debug(() -> "[" + deploymentId + "] Start deployment failed as model [" + modelId + "] was not found", ex);
223-
handleLoadFailure(loadingTask, ExceptionsHelper.missingTrainedModel(modelId, ex));
243+
handleLoadFailure(loadingTask, ExceptionsHelper.missingTrainedModel(modelId, ex), retryL);
224244
} else if (ExceptionsHelper.unwrapCause(ex) instanceof SearchPhaseExecutionException) {
225245
/*
226246
* This case will not catch the ElasticsearchException generated from the ChunkedTrainedModelRestorer in a scenario
@@ -232,13 +252,11 @@ void loadQueuedModels() {
232252
// A search phase execution failure should be retried, push task back to the queue
233253

234254
// This will cause the entire model to be reloaded (all the chunks)
235-
loadingToRetry.add(loadingTask);
255+
retryL.onResponse(true);
236256
} else {
237-
handleLoadFailure(loadingTask, ex);
257+
handleLoadFailure(loadingTask, ex, retryL);
238258
}
239-
}
240-
}
241-
loadingModels.addAll(loadingToRetry);
259+
}), threadPool.executor(MachineLearning.UTILITY_THREAD_POOL_NAME), threadPool.getThreadContext());
242260
}
243261

244262
public void gracefullyStopDeploymentAndNotify(
@@ -680,14 +698,14 @@ void prepareModelToLoad(StartTrainedModelDeploymentAction.TaskParams taskParams)
680698
);
681699
// threadsafe check to verify we are not loading/loaded the model
682700
if (deploymentIdToTask.putIfAbsent(taskParams.getDeploymentId(), task) == null) {
683-
loadingModels.add(task);
701+
loadingModels.offer(task);
684702
} else {
685703
// If there is already a task for the deployment, unregister the new task
686704
taskManager.unregister(task);
687705
}
688706
}
689707

690-
private void handleLoadSuccess(TrainedModelDeploymentTask task) {
708+
private void handleLoadSuccess(ActionListener<Boolean> retryListener, TrainedModelDeploymentTask task) {
691709
logger.debug(
692710
() -> "["
693711
+ task.getParams().getDeploymentId()
@@ -704,13 +722,16 @@ private void handleLoadSuccess(TrainedModelDeploymentTask task) {
704722
task.stoppedReason().orElse("_unknown_")
705723
)
706724
);
725+
retryListener.onResponse(false);
707726
return;
708727
}
709728

710729
updateStoredState(
711730
task.getDeploymentId(),
712731
RoutingInfoUpdate.updateStateAndReason(new RoutingStateAndReason(RoutingState.STARTED, "")),
713-
ActionListener.wrap(r -> logger.debug(() -> "[" + task.getDeploymentId() + "] model loaded and accepting routes"), e -> {
732+
ActionListener.runAfter(ActionListener.wrap(r -> {
733+
logger.debug(() -> "[" + task.getDeploymentId() + "] model loaded and accepting routes");
734+
}, e -> {
714735
// This means that either the assignment has been deleted, or this node's particular route has been removed
715736
if (ExceptionsHelper.unwrapCause(e) instanceof ResourceNotFoundException) {
716737
logger.debug(
@@ -732,7 +753,7 @@ private void handleLoadSuccess(TrainedModelDeploymentTask task) {
732753
e
733754
);
734755
}
735-
})
756+
}), () -> retryListener.onResponse(false))
736757
);
737758
}
738759

@@ -752,7 +773,7 @@ private void updateStoredState(String deploymentId, RoutingInfoUpdate update, Ac
752773
);
753774
}
754775

755-
private void handleLoadFailure(TrainedModelDeploymentTask task, Exception ex) {
776+
private void handleLoadFailure(TrainedModelDeploymentTask task, Exception ex, ActionListener<Boolean> retryListener) {
756777
logger.error(() -> "[" + task.getDeploymentId() + "] model [" + task.getParams().getModelId() + "] failed to load", ex);
757778
if (task.isStopped()) {
758779
logger.debug(
@@ -769,14 +790,14 @@ private void handleLoadFailure(TrainedModelDeploymentTask task, Exception ex) {
769790
Runnable stopTask = () -> stopDeploymentAsync(
770791
task,
771792
"model failed to load; reason [" + ex.getMessage() + "]",
772-
ActionListener.noop()
793+
ActionListener.running(() -> retryListener.onResponse(false))
773794
);
774795
updateStoredState(
775796
task.getDeploymentId(),
776797
RoutingInfoUpdate.updateStateAndReason(
777798
new RoutingStateAndReason(RoutingState.FAILED, ExceptionsHelper.unwrapCause(ex).getMessage())
778799
),
779-
ActionListener.wrap(r -> stopTask.run(), e -> stopTask.run())
800+
ActionListener.running(stopTask)
780801
);
781802
}
782803

0 commit comments

Comments (0)