Skip to content

Commit 51d4279

Browse files
committed
[SPARK-55974][CORE][YARN] Relaunch new executors if executor launching takes too long
1 parent 587dfa4 commit 51d4279

File tree

8 files changed

+238
-9
lines changed

8 files changed

+238
-9
lines changed

core/src/main/scala/org/apache/spark/ExecutorAllocationClient.scala

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -37,6 +37,13 @@ private[spark] trait ExecutorAllocationClient {
3737
*/
3838
def isExecutorActive(id: String): Boolean
3939

40+
/**
41+
* Hook for subclasses to be notified when an executor has registered with the driver.
42+
* YarnSchedulerBackend overrides this to send ExecutorRegistered to the AM for
43+
* executor launch timeout tracking.
44+
*/
45+
def onExecutorRegistered(executorId: String, resourceProfileId: Int): Unit
46+
4047
/**
4148
* Update the cluster manager on our scheduling needs. Three bits of information are included
4249
* to help it make decisions.

core/src/main/scala/org/apache/spark/scheduler/cluster/CoarseGrainedClusterMessage.scala

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -123,6 +123,11 @@ private[spark] object CoarseGrainedClusterMessages {
123123
// indicating the executor starts to decommission.
124124
object ExecutorDecommissionSigReceived extends CoarseGrainedClusterMessage
125125

126+
// Driver notifies AM when an executor has registered with the driver. Used for executor launch
127+
// timeout tracking: AM stops tracking the executor for timeout once it receives this message.
128+
case class ExecutorRegistered(executorId: String, resourceProfileId: Int)
129+
extends CoarseGrainedClusterMessage
130+
126131
case class RemoveWorker(workerId: String, host: String, message: String)
127132
extends CoarseGrainedClusterMessage
128133

core/src/main/scala/org/apache/spark/scheduler/cluster/CoarseGrainedSchedulerBackend.scala

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -318,6 +318,7 @@ class CoarseGrainedSchedulerBackend(scheduler: TaskSchedulerImpl, val rpcEnv: Rp
318318
decommissionExecutors(Array((executorId, v._1)), v._2, v._3)
319319
unknownExecutorsPendingDecommission.invalidate(executorId)
320320
})
321+
CoarseGrainedSchedulerBackend.this.onExecutorRegistered(executorId, resourceProfileId)
321322
context.reply(true)
322323
}
323324

@@ -754,6 +755,8 @@ class CoarseGrainedSchedulerBackend(scheduler: TaskSchedulerImpl, val rpcEnv: Rp
754755
!executorsPendingDecommission.contains(id)
755756
}
756757

758+
override def onExecutorRegistered(executorId: String, resourceProfileId: Int): Unit = {}
759+
757760
/**
758761
* Get the max number of tasks that can be concurrent launched based on the ResourceProfile
759762
* could be used, even if some of them are being used at the moment.

resource-managers/yarn/src/main/scala/org/apache/spark/deploy/yarn/ApplicationMaster.scala

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -848,6 +848,9 @@ private[spark] class ApplicationMaster(
848848
case None =>
849849
logWarning("Container allocator is not ready to find executor loss reasons yet.")
850850
}
851+
852+
case ExecutorRegistered(executorId, resourceProfileId) =>
853+
Option(allocator).foreach(_.onExecutorRegistered(executorId, resourceProfileId))
851854
}
852855

853856
override def onDisconnected(remoteAddress: RpcAddress): Unit = {

resource-managers/yarn/src/main/scala/org/apache/spark/deploy/yarn/YarnAllocator.scala

Lines changed: 64 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -96,8 +96,14 @@ private[yarn] class YarnAllocator(
9696
@GuardedBy("this")
9797
private val launchingExecutorContainerIds = collection.mutable.HashSet[ContainerId]()
9898

99+
// ResourceProfileId -> (executorId -> Option[(firstSeenMs, isRelaunching)])
100+
// Tracks all running executors and their launch timeout state. Value: Some((firstSeenMs,
101+
// isRelaunching)) = not yet confirmed by driver (firstSeenMs = launch time, isRelaunching =
102+
// true if replacement already requested after timeout); None = confirmed by driver; entry
103+
// removed when executor/container is lost.
99104
@GuardedBy("this")
100-
private val runningExecutorsPerResourceProfileId = new HashMap[Int, mutable.Set[String]]()
105+
private val runningExecutorsPerResourceProfileId =
106+
new HashMap[Int, mutable.HashMap[String, Option[(Long, Boolean)]]]()
101107

102108
@GuardedBy("this")
103109
private val numExecutorsStartingPerResourceProfileId = new HashMap[Int, AtomicInteger]
@@ -175,6 +181,8 @@ private[yarn] class YarnAllocator(
175181

176182
private val memoryOverheadFactor = sparkConf.get(EXECUTOR_MEMORY_OVERHEAD_FACTOR)
177183

184+
private val maxDelayLaunchMillis = sparkConf.get(CONTAINER_LAUNCH_TIMEOUT)
185+
178186
private val launcherPool = ThreadUtils.newDaemonCachedThreadPool(
179187
"ContainerLauncher", sparkConf.get(CONTAINER_LAUNCH_MAX_THREADS))
180188

@@ -216,7 +224,8 @@ private[yarn] class YarnAllocator(
216224
private def initDefaultProfile(): Unit = synchronized {
217225
allocatedHostToContainersMapPerRPId(DEFAULT_RESOURCE_PROFILE_ID) =
218226
new HashMap[String, mutable.Set[ContainerId]]()
219-
runningExecutorsPerResourceProfileId.put(DEFAULT_RESOURCE_PROFILE_ID, mutable.HashSet[String]())
227+
runningExecutorsPerResourceProfileId.put(DEFAULT_RESOURCE_PROFILE_ID,
228+
mutable.HashMap[String, Option[(Long, Boolean)]]())
220229
numExecutorsStartingPerResourceProfileId(DEFAULT_RESOURCE_PROFILE_ID) = new AtomicInteger(0)
221230
val initTargetExecNum = SchedulerBackendUtils.getInitialTargetExecutorNumber(sparkConf)
222231
targetNumExecutorsPerResourceProfileId(DEFAULT_RESOURCE_PROFILE_ID) = initTargetExecNum
@@ -280,8 +289,44 @@ private[yarn] class YarnAllocator(
280289
new HashMap[String, mutable.Set[ContainerId]]())
281290
}
282291

283-
private def getOrUpdateRunningExecutorForRPId(rpId: Int): mutable.Set[String] = synchronized {
284-
runningExecutorsPerResourceProfileId.getOrElseUpdate(rpId, mutable.HashSet[String]())
292+
private def getOrUpdateRunningExecutorForRPId(
293+
rpId: Int): mutable.HashMap[String, Option[(Long, Boolean)]] = synchronized {
294+
runningExecutorsPerResourceProfileId.getOrElseUpdate(rpId,
295+
mutable.HashMap[String, Option[(Long, Boolean)]]())
296+
}
297+
298+
/** Count executors in the given map that are marked for relaunch (isRelaunching = true). */
299+
private def countRelaunching(map: mutable.HashMap[String, Option[(Long, Boolean)]]): Int = {
300+
map.count { case (_, v) => v.exists { case (_, isRelaunching) => isRelaunching } }
301+
}
302+
303+
def getNumExecutorsMissLaunched: Int = synchronized {
304+
runningExecutorsPerResourceProfileId.values.map { m =>
305+
m.count { case (_, v) => v.exists { case (_, isRelaunching) => !isRelaunching } }
306+
}.sum
307+
}
308+
309+
/**
310+
* Updates launch timeout tracking: marks unconfirmed executors that exceed maxDelayLaunchMillis
311+
* as needing relaunch. Returns the count of executors for which replacement has been requested.
312+
*/
313+
private def updateAndGetRelaunchingCount(rpId: Int): Int = {
314+
val running = getOrUpdateRunningExecutorForRPId(rpId)
315+
val now = clock.getTimeMillis()
316+
running.keys.toSeq.foreach { executorId =>
317+
running(executorId).foreach { case (firstSeen, isRelaunching) =>
318+
if (!isRelaunching && now - firstSeen > maxDelayLaunchMillis) {
319+
val hostOpt = executorIdToContainer
320+
.get(executorId)
321+
.flatMap(c => allocatedContainerToHostMap.get(c.getId))
322+
allocatorNodeHealthTracker.handleResourceAllocationFailure(hostOpt)
323+
running(executorId) = Some((firstSeen, true))
324+
logWarning(s"Requesting new resources since launching executor " +
325+
s"$executorId takes more than $maxDelayLaunchMillis ms")
326+
}
327+
}
328+
}
329+
countRelaunching(running)
285330
}
286331

287332
private def getOrUpdateNumExecutorsStartingForRPId(rpId: Int): AtomicInteger = synchronized {
@@ -428,6 +473,12 @@ private[yarn] class YarnAllocator(
428473
}
429474
}
430475

476+
private[yarn] def onExecutorRegistered(executorId: String, resourceProfileId: Int): Unit = {
477+
synchronized {
478+
getOrUpdateRunningExecutorForRPId(resourceProfileId).update(executorId, None)
479+
}
480+
}
481+
431482
/**
432483
* Request resources such that, if YARN gives us all we ask for, we'll have a number of containers
433484
* equal to maxExecutors.
@@ -500,10 +551,12 @@ private[yarn] class YarnAllocator(
500551
val missingPerProfile = targetNumExecutorsPerResourceProfileId.map { case (rpId, targetNum) =>
501552
val starting = getOrUpdateNumExecutorsStartingForRPId(rpId).get
502553
val pending = pendingAllocatePerResourceProfileId.getOrElse(rpId, Seq.empty).size
554+
val relaunching = updateAndGetRelaunchingCount(rpId)
503555
val running = getOrUpdateRunningExecutorForRPId(rpId).size
504556
logDebug(s"Updating resource requests for ResourceProfile id: $rpId, target: " +
505-
s"$targetNum, pending: $pending, running: $running, executorsStarting: $starting")
506-
(rpId, targetNum - pending - running - starting)
557+
s"$targetNum, pending: $pending, running: $running, executorsStarting: $starting, " +
558+
s"relaunchingExecutors: $relaunching")
559+
(rpId, targetNum - pending - running - starting + relaunching)
507560
}.toMap
508561

509562
missingPerProfile.foreach { case (rpId, missing) =>
@@ -776,7 +829,8 @@ private[yarn] class YarnAllocator(
776829
val containerCores = rp.getExecutorCores.getOrElse(defaultCores)
777830

778831
val rpRunningExecs = getOrUpdateRunningExecutorForRPId(rpId).size
779-
if (rpRunningExecs < getOrUpdateTargetNumExecutorsForRPId(rpId)) {
832+
val relaunchingCount = countRelaunching(getOrUpdateRunningExecutorForRPId(rpId))
833+
if (rpRunningExecs < getOrUpdateTargetNumExecutorsForRPId(rpId) + relaunchingCount) {
780834
getOrUpdateNumExecutorsStartingForRPId(rpId).incrementAndGet()
781835
launchingExecutorContainerIds.add(containerId)
782836
if (launchContainers) {
@@ -819,7 +873,8 @@ private[yarn] class YarnAllocator(
819873
} else {
820874
logInfo(log"Skip launching executorRunnable as running executors count: " +
821875
log"${MDC(LogKeys.COUNT, rpRunningExecs)} reached target executors count: " +
822-
log"${MDC(LogKeys.NUM_EXECUTOR_TARGET, getOrUpdateTargetNumExecutorsForRPId(rpId))}.")
876+
log"${MDC(LogKeys.NUM_EXECUTOR_TARGET, getOrUpdateTargetNumExecutorsForRPId(rpId))} " +
877+
log"and relaunching count: ${MDC(LogKeys.COUNT, relaunchingCount)}.")
823878
internalReleaseContainer(container)
824879
}
825880
}
@@ -829,7 +884,7 @@ private[yarn] class YarnAllocator(
829884
container: Container): Unit = synchronized {
830885
val containerId = container.getId
831886
if (launchingExecutorContainerIds.contains(containerId)) {
832-
getOrUpdateRunningExecutorForRPId(rpId).add(executorId)
887+
getOrUpdateRunningExecutorForRPId(rpId)(executorId) = Some((clock.getTimeMillis(), false))
833888
executorIdToContainer(executorId) = container
834889
containerIdToExecutorIdAndResourceProfileId(containerId) = (executorId, rpId)
835890

resource-managers/yarn/src/main/scala/org/apache/spark/deploy/yarn/config/package.scala

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -269,6 +269,16 @@ package object config extends Logging {
269269
.intConf
270270
.createWithDefault(25)
271271

272+
private[spark] val CONTAINER_LAUNCH_TIMEOUT =
273+
ConfigBuilder("spark.yarn.containerLaunchTimeout")
274+
.doc("Maximum time to wait for an executor to successfully launch before considering " +
275+
"it stuck and requesting a replacement. This timeout helps detect stuck " +
276+
"executor launches in YARN mode. If an executor takes longer than this timeout to " +
277+
"launch, it will be marked for relaunch and the host may be marked as unhealthy.")
278+
.version("3.2.0-sdi-136")
279+
.timeConf(TimeUnit.MILLISECONDS)
280+
.createWithDefaultString("10min")
281+
272282
private[spark] val MAX_REPORTER_THREAD_FAILURES =
273283
ConfigBuilder("spark.yarn.scheduler.reporterThread.maxFailures")
274284
.version("1.2.0")

resource-managers/yarn/src/main/scala/org/apache/spark/scheduler/cluster/YarnSchedulerBackend.scala

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -170,6 +170,10 @@ private[spark] abstract class YarnSchedulerBackend(
170170
yarnSchedulerEndpointRef.ask[Boolean](KillExecutors(executorIds))
171171
}
172172

173+
override protected def onExecutorRegistered(executorId: String, resourceProfileId: Int): Unit = {
174+
amEndpoint.foreach(_.send(ExecutorRegisteredWithDriver(executorId, resourceProfileId)))
175+
}
176+
173177
override def sufficientResourcesRegistered(): Boolean = {
174178
totalRegisteredExecutors.get() >= totalExpectedExecutors * minRegisteredRatio
175179
}

resource-managers/yarn/src/test/scala/org/apache/spark/deploy/yarn/YarnAllocatorSuite.scala

Lines changed: 142 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -904,4 +904,146 @@ class YarnAllocatorSuite extends SparkFunSuite
904904
handler.getNumExecutorsRunning should be(0)
905905
handler.getNumExecutorsStarting should be(0)
906906
}
907+
908+
test("SPARK-55974: should not mark executors for relaunch before timeout") {
909+
val timeoutMs = 5000L
910+
val (handler, _) = createAllocator(
911+
maxExecutors = 2,
912+
additionalConfigs = Map(CONTAINER_LAUNCH_TIMEOUT.key -> timeoutMs.toString))
913+
handler.updateResourceRequests()
914+
handler.getNumContainersPendingAllocate should be (2)
915+
916+
val container1 = createContainer("host1")
917+
val container2 = createContainer("host2")
918+
919+
handler.handleAllocatedContainers(Array(container1, container2).toImmutableArraySeq)
920+
handler.getNumExecutorsRunning should be (2)
921+
922+
// Executors becoming missing (not confirmed by driver)
923+
handler.updateResourceRequests()
924+
925+
// Advance time but not enough to trigger timeout
926+
clock.advance(timeoutMs - 1000L)
927+
handler.updateResourceRequests()
928+
929+
handler.getNumExecutorsRunning should be (2)
930+
handler.getNumContainersPendingAllocate should be (0)
931+
}
932+
933+
test("SPARK-55974: should mark executors for relaunch after timeout") {
934+
val timeoutMs = 5000L
935+
val (handler, _) = createAllocator(
936+
maxExecutors = 2,
937+
additionalConfigs = Map(CONTAINER_LAUNCH_TIMEOUT.key -> timeoutMs.toString))
938+
handler.updateResourceRequests()
939+
handler.getNumContainersPendingAllocate should be (2)
940+
941+
val container1 = createContainer("host1")
942+
val container2 = createContainer("host2")
943+
944+
handler.handleAllocatedContainers(Array(container1, container2).toImmutableArraySeq)
945+
handler.getNumExecutorsRunning should be (2)
946+
947+
// Executors becoming missing (not confirmed by driver)
948+
handler.updateResourceRequests()
949+
950+
// Advance time past timeout
951+
clock.advance(timeoutMs + 1000L)
952+
handler.updateResourceRequests()
953+
954+
// Should relaunch new containers
955+
handler.getNumContainersPendingAllocate should be (2)
956+
handler.getNumExecutorsRunning should be (2)
957+
}
958+
959+
test("SPARK-55974: should handle mixed active and missing executors") {
960+
val timeoutMs = 5000L
961+
val (handler, _) = createAllocator(
962+
maxExecutors = 3,
963+
additionalConfigs = Map(CONTAINER_LAUNCH_TIMEOUT.key -> timeoutMs.toString))
964+
handler.updateResourceRequests()
965+
handler.getNumContainersPendingAllocate should be (3)
966+
967+
val container1 = createContainer("host1")
968+
val container2 = createContainer("host2")
969+
val container3 = createContainer("host3")
970+
971+
handler.handleAllocatedContainers(Array(container1, container2, container3).toImmutableArraySeq)
972+
handler.getNumExecutorsRunning should be (3)
973+
974+
// Only 1 executor is active, 2 are missing
975+
handler.onExecutorRegistered("1", defaultRPId)
976+
handler.updateResourceRequests()
977+
978+
// Advance time past timeout
979+
clock.advance(timeoutMs + 1000L)
980+
handler.updateResourceRequests()
981+
982+
// Should relaunch 2 new containers
983+
handler.getNumContainersPendingAllocate should be (2)
984+
handler.getNumExecutorsRunning should be (3)
985+
}
986+
987+
test("SPARK-55974: should reset tracking when executors become active again") {
988+
val timeoutMs = 5000L
989+
val (handler, _) = createAllocator(
990+
maxExecutors = 2,
991+
additionalConfigs = Map(CONTAINER_LAUNCH_TIMEOUT.key -> timeoutMs.toString))
992+
handler.updateResourceRequests()
993+
handler.getNumContainersPendingAllocate should be (2)
994+
995+
val container1 = createContainer("host1")
996+
val container2 = createContainer("host2")
997+
998+
handler.handleAllocatedContainers(Array(container1, container2).toImmutableArraySeq)
999+
handler.getNumExecutorsRunning should be (2)
1000+
1001+
handler.updateResourceRequests()
1002+
handler.getNumExecutorsMissLaunched should be (2)
1003+
1004+
clock.advance(timeoutMs - 1000L)
1005+
handler.updateResourceRequests()
1006+
handler.getNumExecutorsMissLaunched should be (2)
1007+
1008+
// Executors become active (confirmed by driver)
1009+
handler.onExecutorRegistered("1", defaultRPId)
1010+
handler.onExecutorRegistered("2", defaultRPId)
1011+
handler.updateResourceRequests()
1012+
handler.getNumExecutorsMissLaunched should be (0)
1013+
1014+
// Should not request new containers since executors are active again
1015+
handler.getNumContainersPendingAllocate should be (0)
1016+
}
1017+
1018+
test("SPARK-55974: executor launch timeout - should handle multiple timeout cycles") {
1019+
val timeoutMs = 3000L
1020+
val (handler, _) = createAllocator(
1021+
maxExecutors = 1,
1022+
additionalConfigs = Map(CONTAINER_LAUNCH_TIMEOUT.key -> timeoutMs.toString))
1023+
handler.updateResourceRequests()
1024+
handler.getNumContainersPendingAllocate should be (1)
1025+
1026+
val container = createContainer("host1")
1027+
handler.handleAllocatedContainers(Array(container).toImmutableArraySeq)
1028+
handler.getNumExecutorsRunning should be (1)
1029+
1030+
// First timeout cycle
1031+
handler.updateResourceRequests()
1032+
clock.advance(timeoutMs + 1000L)
1033+
handler.updateResourceRequests()
1034+
handler.getNumContainersPendingAllocate should be (1)
1035+
1036+
// Simulate new container allocated
1037+
val newContainer = createContainer("host2")
1038+
handler.handleAllocatedContainers(Array(newContainer).toImmutableArraySeq)
1039+
handler.getNumExecutorsRunning should be (2)
1040+
1041+
// Second timeout cycle
1042+
handler.updateResourceRequests()
1043+
clock.advance(timeoutMs + 1000L)
1044+
handler.updateResourceRequests()
1045+
1046+
// Should request another container
1047+
handler.getNumContainersPendingAllocate should be (1)
1048+
}
9071049
}

0 commit comments

Comments
 (0)