Skip to content

Commit 57134f5

Browse files
authored
Fix actor slow startup times (#621)
## Overview Somewhere along the way a regression on actor performance occurred (probably during my large refactor). Cold-start times are extremely large as a result of reporting 0 capacity for the initial heartbeat. This means we need to wait for the next heartbeat for an updated capacity report to assign any tasks to the worker. The problem is that the executor startup was in a race condition with the heartbeat mechanism, so if the Rust worker sent a heartbeat before the executors were initialized then the capacity was 0. This PR adds a future over a boolean to determine whether the runtime has initialized before starting the heartbeat runtime. ## Test Plan Ran this locally and decreased cold-start for "hello world" from 16s to 2s. Going to run some tests on cloud to make sure. ## Rollout Plan (if applicable) This may be rolled out immediately. We will need to cut a new `union` package. ## Upstream Changes Should this change be upstreamed to OSS (flyteorg/flyte)? If not, please uncheck this box, which is used for auditing. Note, it is the responsibility of each developer to actually upstream their changes. See [this guide](https://unionai.atlassian.net/wiki/spaces/ENG/pages/447610883/Flyte+-+Union+Cloud+Development+Runbook/#When-are-versions-updated%3F). - [ ] To be upstreamed to OSS ## Issue fixes https://linear.app/unionai/issue/COR-2673/fix-startup-way-slower-than-regular-tasks ## Checklist * [ ] Added tests * [ ] Ran a deploy dry run and shared the terraform plan * [ ] Added logging and metrics * [ ] Updated [dashboards](https://unionai.grafana.net/dashboards) and [alerts](https://unionai.grafana.net/alerting/list) * [ ] Updated documentation
1 parent 01553a6 commit 57134f5

File tree

7 files changed

+105
-66
lines changed

7 files changed

+105
-66
lines changed

fasttask/plugin/plugin.go

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -46,7 +46,7 @@ const maxErrorMessageLength = 102400 // 100kb
4646
var (
4747
statusUpdateNotFoundError = errors.New("StatusUpdateNotFound")
4848
taskContextNotFoundError = errors.New("TaskContextNotFound")
49-
podContainerNotFound = errors.New("PodContainerNotFound")
49+
podContainerNotFoundError = errors.New("PodContainerNotFound")
5050

5151
taskStartTimeTemplateVar = tasklog.MustCreateRegex("taskStartTime")
5252
taskStartTimeUnixMsTemplateVar = tasklog.MustCreateRegex("taskStartTimeUnixMs")
@@ -394,7 +394,7 @@ func (p *Plugin) trySubmitTask(ctx context.Context, tCtx core.TaskExecutionConte
394394
pluginState.LastUpdated = now
395395

396396
taskInfo, err := p.getTaskInfo(ctx, tCtx, initialState.SubmittedAt, time.Now(), executionEnv, queueID, workerID)
397-
if err != nil {
397+
if err != nil && !errors.Is(err, podContainerNotFoundError) {
398398
return nil, core.PhaseInfoUndefined, err
399399
}
400400
phaseInfo = core.PhaseInfoQueuedWithTaskInfo(now, pluginState.PhaseVersion, fmt.Sprintf("task offered to worker %s", workerID), taskInfo)
@@ -557,7 +557,7 @@ func (p *Plugin) getTaskInfo(ctx context.Context, tCtx core.TaskExecutionContext
557557
// an in-memory store the may occur during restarts.
558558
// `pod == nil` may occur if it has not yet been populated in the kubeclient cache or was deleted
559559
logger.Warnf(ctx, "Worker %q not found (exists=%t) in status map for queue %q", workerID, ok, queueID)
560-
return &taskInfo, podContainerNotFound
560+
return &taskInfo, podContainerNotFoundError
561561
}
562562

563563
containerIndex := -1
@@ -569,7 +569,7 @@ func (p *Plugin) getTaskInfo(ctx context.Context, tCtx core.TaskExecutionContext
569569
}
570570
if containerIndex == -1 {
571571
logger.Warnf(ctx, "Container %q not found in pod %q", pod.GetName(), pod.GetName())
572-
return &taskInfo, podContainerNotFound
572+
return &taskInfo, podContainerNotFoundError
573573
}
574574

575575
taskInfo.LogContext = &idlcore.LogContext{
@@ -594,7 +594,7 @@ func (p *Plugin) getTaskInfo(ctx context.Context, tCtx core.TaskExecutionContext
594594

595595
if len(pod.Status.ContainerStatuses) <= containerIndex || pod.Status.ContainerStatuses[containerIndex].ContainerID == "" {
596596
// no container id yet
597-
return &taskInfo, podContainerNotFound
597+
return &taskInfo, podContainerNotFoundError
598598
}
599599

600600
taskTemplate, err := tCtx.TaskReader().Read(ctx)

fasttask/plugin/plugin_test.go

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -665,7 +665,7 @@ func TestHandleRunning(t *testing.T) {
665665
expectedPhase: core.PhaseUndefined,
666666
expectedPhaseVersion: 0,
667667
expectedReason: "",
668-
expectedError: podContainerNotFound,
668+
expectedError: podContainerNotFoundError,
669669
expectedLastUpdatedInc: false,
670670
expectedLogs: false,
671671
},
@@ -1002,7 +1002,7 @@ func TestGetTaskInfo(t *testing.T) {
10021002

10031003
taskInfo, err := plugin.getTaskInfo(ctx, tCtx, start, now, executionEnv, queueID, workerID)
10041004

1005-
assert.Equal(t, podContainerNotFound, err)
1005+
assert.Equal(t, podContainerNotFoundError, err)
10061006
assert.Empty(t, taskInfo.Logs)
10071007
assert.Nil(t, taskInfo.LogContext)
10081008
})
@@ -1050,7 +1050,7 @@ func TestGetTaskInfo(t *testing.T) {
10501050

10511051
taskInfo, err := plugin.getTaskInfo(ctx, tCtx, start, now, executionEnv, queueID, workerID)
10521052

1053-
assert.Equal(t, podContainerNotFound, err)
1053+
assert.Equal(t, podContainerNotFoundError, err)
10541054
assert.Empty(t, taskInfo.Logs)
10551055
assert.Nil(t, taskInfo.LogContext)
10561056
})
@@ -1098,7 +1098,7 @@ func TestGetTaskInfo(t *testing.T) {
10981098

10991099
taskInfo, err := plugin.getTaskInfo(ctx, tCtx, start, now, executionEnv, queueID, workerID)
11001100

1101-
assert.Equal(t, podContainerNotFound, err)
1101+
assert.Equal(t, podContainerNotFoundError, err)
11021102
assert.Empty(t, taskInfo.Logs)
11031103
assert.Equal(t, expectedLogCtx, taskInfo.LogContext)
11041104
})

fasttask/plugin/service.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -384,7 +384,7 @@ func (f *fastTaskServiceImpl) OfferOnQueue(ctx context.Context, queueID, taskID,
384384
}
385385

386386
// create task status channel
387-
f.taskStatusChannels.Store(taskID, make(chan *workerTaskStatus, GetConfig().TaskStatusBufferSize))
387+
f.taskStatusChannels.LoadOrStore(taskID, make(chan *workerTaskStatus, GetConfig().TaskStatusBufferSize))
388388
return worker.workerID, nil
389389
}
390390

fasttask/worker/bridge/src/bridge.rs

Lines changed: 12 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,7 @@
1+
use std::sync::{Arc, Mutex};
12
use std::time::Duration;
23

4+
use crate::common::{AsyncBool, AsyncBoolFuture};
35
use crate::connection::{ConnectionBuilder, ConnectionRuntime};
46
use crate::heartbeater::{HeartbeatRuntime, Heartbeater};
57
use crate::manager::{CapacityReporter, TaskManager, TaskManagerRuntime};
@@ -19,16 +21,25 @@ pub async fn run<T: ConnectionBuilder, U: Heartbeater + Send, V: TaskManager>(
1921
let (task_status_tx, task_status_rx) = async_channel::unbounded();
2022

2123
// initialize and start manager
24+
let manager_runtime_ready = Arc::new(Mutex::new(AsyncBool::new()));
25+
let manager_runtime_ready_clone = manager_runtime_ready.clone();
26+
2227
let manager_runtime = manager.get_runtime()?; // TODO @hamersaw - handle error
2328
let _manager_handle = tokio::spawn(async move {
2429
// currently panicking if manager runtime fails rather than attempting to restart. this will
2530
// effectively force a new replica and failover tasks. a manager runtime failure should
2631
// only occur as a bug.
27-
if let Err(e) = manager_runtime.run(task_status_tx).await {
32+
if let Err(e) = manager_runtime
33+
.run(manager_runtime_ready_clone, task_status_tx)
34+
.await
35+
{
2836
panic!("manager failed: {}", e);
2937
}
3038
});
3139

40+
let manager_runtime_future = AsyncBoolFuture::new(manager_runtime_ready);
41+
manager_runtime_future.await;
42+
3243
// start heartbeater
3344
let heartbeat_runtime = heartbeater.get_runtime()?; // TODO @hamersaw - handle error
3445
let _heartbeat_handle = tokio::spawn(async move {

fasttask/worker/bridge/src/common.rs

Lines changed: 52 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,8 @@
11
use std::collections::HashMap;
2+
use std::future::Future;
3+
use std::pin::Pin;
4+
use std::sync::{Arc, Mutex};
5+
use std::task::{Context, Poll, Waker};
26

37
use async_channel::Sender;
48
use serde::{Deserialize, Serialize};
@@ -35,3 +39,51 @@ pub struct Response {
3539

3640
pub executor_corrupt: bool,
3741
}
42+
43+
/// Future that resolves once the paired [`AsyncBool`] is triggered.
///
/// Polling consumes the trigger: after this future resolves, a fresh poll
/// (of a new or re-created future over the same `AsyncBool`) is `Pending`
/// until `trigger` is called again. This provides one-shot signalling from
/// a producer task to a single awaiting consumer.
pub struct AsyncBoolFuture {
    async_bool: Arc<Mutex<AsyncBool>>,
}

impl AsyncBoolFuture {
    /// Creates a future that completes when `async_bool` is triggered.
    pub fn new(async_bool: Arc<Mutex<AsyncBool>>) -> Self {
        Self { async_bool }
    }
}

impl Future for AsyncBoolFuture {
    type Output = ();

    fn poll(self: Pin<&mut Self>, ctx: &mut Context<'_>) -> Poll<()> {
        let mut async_bool = self.async_bool.lock().unwrap();
        if async_bool.value {
            // Consume the trigger so the next await waits for a fresh one.
            async_bool.value = false;
            return Poll::Ready(());
        }

        // Always register the latest waker: the task may be polled from a
        // different context than the previous poll.
        async_bool.waker = Some(ctx.waker().clone());
        Poll::Pending
    }
}

/// A mutex-guarded one-shot boolean flag that wakes an awaiting
/// [`AsyncBoolFuture`]. Intended to be shared as `Arc<Mutex<AsyncBool>>`.
pub struct AsyncBool {
    // True when triggered and not yet observed by a poll.
    value: bool,
    // Waker of the most recent poller, if any.
    waker: Option<Waker>,
}

impl Default for AsyncBool {
    fn default() -> Self {
        Self::new()
    }
}

impl AsyncBool {
    /// Creates an untriggered flag with no registered waker.
    pub fn new() -> Self {
        Self {
            value: false,
            waker: None,
        }
    }

    /// Sets the flag and wakes the registered poller, if any.
    ///
    /// Takes the stored waker out rather than cloning it in place, so a
    /// stale waker is never woken a second time by a later trigger; the
    /// next poll re-registers the current waker.
    pub fn trigger(&mut self) {
        self.value = true;
        if let Some(waker) = self.waker.take() {
            waker.wake();
        }
    }
}

fasttask/worker/bridge/src/heartbeater.rs

Lines changed: 2 additions & 50 deletions
Original file line numberDiff line numberDiff line change
@@ -1,14 +1,11 @@
1-
use std::future::Future;
2-
use std::pin::Pin;
31
use std::sync::{Arc, Mutex, RwLock};
4-
use std::task::{Context, Poll, Waker};
52
use std::time::Duration;
63

74
use anyhow::Result;
85
use async_channel::{Receiver, Sender};
96
use tokio::time::Interval;
107

11-
use crate::common::{FAILED, SUCCEEDED};
8+
use crate::common::{AsyncBool, AsyncBoolFuture, FAILED, SUCCEEDED};
129
use crate::manager::CapacityReporter;
1310
use crate::pb::fasttask::{HeartbeatRequest, TaskStatus};
1411

@@ -114,59 +111,14 @@ impl HeartbeatRuntime for PeriodicHeartbeatRuntime {
114111
}
115112
}
116113

117-
struct AsyncBoolFuture {
118-
async_bool: Arc<Mutex<AsyncBool>>,
119-
}
120-
121-
impl Future for AsyncBoolFuture {
122-
type Output = ();
123-
124-
fn poll(self: Pin<&mut Self>, ctx: &mut Context<'_>) -> Poll<()> {
125-
let mut async_bool = self.async_bool.lock().unwrap();
126-
if async_bool.value {
127-
async_bool.value = false;
128-
return Poll::Ready(());
129-
}
130-
131-
let waker = ctx.waker().clone();
132-
async_bool.waker = Some(waker);
133-
134-
Poll::Pending
135-
}
136-
}
137-
138-
struct AsyncBool {
139-
value: bool,
140-
waker: Option<Waker>,
141-
}
142-
143-
impl AsyncBool {
144-
fn new() -> Self {
145-
Self {
146-
value: false,
147-
waker: None,
148-
}
149-
}
150-
151-
fn trigger(&mut self) {
152-
self.value = true;
153-
if let Some(waker) = &self.waker {
154-
waker.clone().wake();
155-
}
156-
}
157-
}
158-
159114
struct HeartbeatTrigger {
160115
interval: Interval,
161116
async_bool: Arc<Mutex<AsyncBool>>,
162117
}
163118

164119
impl HeartbeatTrigger {
165120
async fn trigger(&mut self) -> () {
166-
let async_bool_future = AsyncBoolFuture {
167-
async_bool: self.async_bool.clone(),
168-
};
169-
121+
let async_bool_future = AsyncBoolFuture::new(self.async_bool.clone());
170122
tokio::select! {
171123
_ = self.interval.tick() => {},
172124
_ = async_bool_future => {},

fasttask/worker/bridge/src/manager.rs

Lines changed: 29 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,7 @@ use tokio::process::Command;
1111
use tokio_util::codec::{Framed, LengthDelimitedCodec};
1212
use tracing::warn;
1313

14-
use crate::common::{Executor, TaskContext, SUCCEEDED};
14+
use crate::common::{AsyncBool, Executor, TaskContext, SUCCEEDED};
1515
use crate::pb::fasttask::{Capacity, TaskStatus};
1616
use crate::task::{self};
1717

@@ -32,7 +32,11 @@ pub trait TaskManager {
3232

3333
#[trait_variant::make(TaskManagerRuntime: Send)]
3434
pub trait LocalTaskManagerRuntime {
35-
async fn run(&self, task_status_tx: Sender<TaskStatus>) -> Result<()>;
35+
async fn run(
36+
&self,
37+
ready: Arc<Mutex<AsyncBool>>,
38+
task_status_tx: Sender<TaskStatus>,
39+
) -> Result<()>;
3640
}
3741

3842
pub struct CapacityReporter {
@@ -316,7 +320,11 @@ pub struct MultiProcessRuntime {
316320
}
317321

318322
impl TaskManagerRuntime for MultiProcessRuntime {
319-
async fn run(&self, task_status_tx: Sender<TaskStatus>) -> Result<()> {
323+
async fn run(
324+
&self,
325+
ready: Arc<Mutex<AsyncBool>>,
326+
task_status_tx: Sender<TaskStatus>,
327+
) -> Result<()> {
320328
let (backlog_tx, backlog_rx) = (self.backlog_tx.clone(), self.backlog_rx.clone());
321329
let (executor_tx, executor_rx) = (self.executor_tx.clone(), self.executor_rx.clone());
322330
let (task_assignment_rx, task_contexts) =
@@ -349,6 +357,12 @@ impl TaskManagerRuntime for MultiProcessRuntime {
349357
self.executor_tx.send(executor).await?;
350358

351359
index += 1;
360+
361+
// trigger ready if all executors are initialized
362+
if index == self.parallelism {
363+
let mut ready = ready.lock().unwrap();
364+
ready.trigger();
365+
}
352366
},
353367
task_assignment_result = task_assignment_rx.recv() => {
354368
let task_assignment= task_assignment_result?;
@@ -507,7 +521,16 @@ pub struct SuccessRuntime {
507521
}
508522

509523
impl TaskManagerRuntime for SuccessRuntime {
510-
async fn run(&self, task_status_tx: Sender<TaskStatus>) -> Result<()> {
524+
async fn run(
525+
&self,
526+
ready: Arc<Mutex<AsyncBool>>,
527+
task_status_tx: Sender<TaskStatus>,
528+
) -> Result<()> {
529+
{
530+
let mut ready = ready.lock().unwrap();
531+
ready.trigger();
532+
}
533+
511534
let task_rx = self.task_rx.clone();
512535
loop {
513536
let task_result = task_rx.recv().await;
@@ -563,8 +586,9 @@ mod tests {
563586
assert!(manager_runtime_result.is_ok());
564587
let manager_runtime = manager_runtime_result.unwrap();
565588

589+
let ready = Arc::new(Mutex::new(AsyncBool::new()));
566590
let manager_handle = tokio::spawn(async move {
567-
super::TaskManagerRuntime::run(&manager_runtime, task_status_tx).await
591+
super::TaskManagerRuntime::run(&manager_runtime, ready, task_status_tx).await
568592
});
569593

570594
// validate get capacity works

0 commit comments

Comments
 (0)