@@ -96,8 +96,14 @@ private[yarn] class YarnAllocator(
9696 @ GuardedBy (" this" )
9797 private val launchingExecutorContainerIds = collection.mutable.HashSet [ContainerId ]()
9898
// Maps ResourceProfileId -> (executorId -> launch-confirmation state) for every running
// executor. The state of an executor is one of:
//   - Some((firstSeenMs, isRelaunching)): the driver has not confirmed the executor yet.
//     firstSeenMs is the launch time; isRelaunching is true once a replacement has been
//     requested because the launch timed out.
//   - None: the driver has confirmed the executor.
// The entry is removed when the executor/container is lost.
99104 @ GuardedBy (" this" )
100- private val runningExecutorsPerResourceProfileId = new HashMap [Int , mutable.Set [String ]]()
105+ private val runningExecutorsPerResourceProfileId =
106+ new HashMap [Int , mutable.HashMap [String , Option [(Long , Boolean )]]]()
101107
102108 @ GuardedBy (" this" )
103109 private val numExecutorsStartingPerResourceProfileId = new HashMap [Int , AtomicInteger ]
@@ -175,6 +181,8 @@ private[yarn] class YarnAllocator(
175181
176182 private val memoryOverheadFactor = sparkConf.get(EXECUTOR_MEMORY_OVERHEAD_FACTOR )
177183
184+ private val maxDelayLaunchMillis = sparkConf.get(CONTAINER_LAUNCH_TIMEOUT )
185+
178186 private val launcherPool = ThreadUtils .newDaemonCachedThreadPool(
179187 " ContainerLauncher" , sparkConf.get(CONTAINER_LAUNCH_MAX_THREADS ))
180188
@@ -216,7 +224,8 @@ private[yarn] class YarnAllocator(
216224 private def initDefaultProfile (): Unit = synchronized {
217225 allocatedHostToContainersMapPerRPId(DEFAULT_RESOURCE_PROFILE_ID ) =
218226 new HashMap [String , mutable.Set [ContainerId ]]()
219- runningExecutorsPerResourceProfileId.put(DEFAULT_RESOURCE_PROFILE_ID , mutable.HashSet [String ]())
227+ runningExecutorsPerResourceProfileId.put(DEFAULT_RESOURCE_PROFILE_ID ,
228+ mutable.HashMap [String , Option [(Long , Boolean )]]())
220229 numExecutorsStartingPerResourceProfileId(DEFAULT_RESOURCE_PROFILE_ID ) = new AtomicInteger (0 )
221230 val initTargetExecNum = SchedulerBackendUtils .getInitialTargetExecutorNumber(sparkConf)
222231 targetNumExecutorsPerResourceProfileId(DEFAULT_RESOURCE_PROFILE_ID ) = initTargetExecNum
@@ -280,8 +289,44 @@ private[yarn] class YarnAllocator(
280289 new HashMap [String , mutable.Set [ContainerId ]]())
281290 }
282291
283- private def getOrUpdateRunningExecutorForRPId (rpId : Int ): mutable.Set [String ] = synchronized {
284- runningExecutorsPerResourceProfileId.getOrElseUpdate(rpId, mutable.HashSet [String ]())
/**
 * Returns the executor-tracking map of the given resource profile, registering a new empty map
 * for the profile first when none exists yet.
 *
 * @param rpId id of the resource profile
 * @return the mutable map (executorId -> launch-confirmation state) stored for `rpId`
 */
private def getOrUpdateRunningExecutorForRPId(
    rpId: Int): mutable.HashMap[String, Option[(Long, Boolean)]] = synchronized {
  runningExecutorsPerResourceProfileId.get(rpId) match {
    case Some(executors) =>
      executors
    case None =>
      // First time this profile is seen: store a fresh map and hand that same instance back.
      val executors = mutable.HashMap.empty[String, Option[(Long, Boolean)]]
      runningExecutorsPerResourceProfileId.put(rpId, executors)
      executors
  }
}
297+
/**
 * Counts the entries of `map` whose state is Some((_, true)), i.e. unconfirmed executors that
 * are flagged as relaunching. Confirmed executors (None) are never counted.
 */
private def countRelaunching(map: mutable.HashMap[String, Option[(Long, Boolean)]]): Int = {
  map.valuesIterator.count {
    case Some((_, relaunching)) => relaunching
    case None => false
  }
}
302+
/**
 * Sums, over all resource profiles, the tracked executors that are still unconfirmed (state is
 * Some) and are not flagged as relaunching.
 *
 * NOTE(review): despite the name, executors already flagged as relaunching (i.e. timed out)
 * are excluded from this count -- confirm that this is the intended meaning.
 */
def getNumExecutorsMissLaunched: Int = synchronized {
  runningExecutorsPerResourceProfileId.valuesIterator.map { executors =>
    executors.valuesIterator.count {
      case Some((_, relaunching)) => !relaunching
      case None => false
    }
  }.sum
}
308+
/**
 * Refreshes launch-timeout tracking for one resource profile and reports how many of its
 * executors currently have a replacement requested.
 *
 * An executor is flagged as relaunching when it is still unconfirmed, not yet flagged, and its
 * firstSeenMs lies more than maxDelayLaunchMillis in the past. For each newly flagged executor
 * the host of its container (when both lookups succeed, otherwise None) is reported to
 * allocatorNodeHealthTracker as a resource allocation failure, and a warning is logged.
 *
 * The whole update runs under the allocator lock because it reads and mutates the map that is
 * annotated as guarded by `this`. JVM monitors are reentrant, so callers that already hold the
 * lock are unaffected.
 *
 * @param rpId id of the resource profile whose executors are checked
 * @return number of executors of this profile flagged as relaunching after the update
 */
private def updateAndGetRelaunchingCount(rpId: Int): Int = synchronized {
  val running = getOrUpdateRunningExecutorForRPId(rpId)
  val now = clock.getTimeMillis()
  // Snapshot the timed-out entries into a strict collection first, so the map is never
  // modified while it is being traversed.
  val timedOut = running.iterator.collect {
    case (executorId, Some((firstSeen, false))) if now - firstSeen > maxDelayLaunchMillis =>
      (executorId, firstSeen)
  }.toVector
  timedOut.foreach { case (executorId, firstSeen) =>
    val hostOpt = executorIdToContainer
      .get(executorId)
      .flatMap(c => allocatedContainerToHostMap.get(c.getId))
    allocatorNodeHealthTracker.handleResourceAllocationFailure(hostOpt)
    // Keep the original launch time; only flip the flag so the executor is reported once.
    running(executorId) = Some((firstSeen, true))
    logWarning(s"Requesting new resources since launching executor " +
      s"$executorId takes more than $maxDelayLaunchMillis ms ")
  }
  countRelaunching(running)
}
286331
287332 private def getOrUpdateNumExecutorsStartingForRPId (rpId : Int ): AtomicInteger = synchronized {
@@ -428,6 +473,12 @@ private[yarn] class YarnAllocator(
428473 }
429474 }
430475
/**
 * Marks an executor as confirmed by the driver by setting its tracked launch state to None,
 * so it is no longer treated as unconfirmed by the launch-timeout bookkeeping.
 *
 * Only executors that are currently tracked for the profile are updated. If no entry exists
 * (for example because it was already removed), the call is a no-op: inserting a new entry
 * here would add an executor to the running set without a corresponding launch, which would
 * inflate the running count used when computing missing executors.
 *
 * @param executorId id of the executor that registered
 * @param resourceProfileId id of the resource profile the executor belongs to
 */
private[yarn] def onExecutorRegistered(executorId: String, resourceProfileId: Int): Unit = {
  synchronized {
    val running = getOrUpdateRunningExecutorForRPId(resourceProfileId)
    if (running.contains(executorId)) {
      running(executorId) = None
    }
  }
}
481+
431482 /**
432483 * Request resources such that, if YARN gives us all we ask for, we'll have a number of containers
433484 * equal to maxExecutors.
@@ -500,10 +551,12 @@ private[yarn] class YarnAllocator(
500551 val missingPerProfile = targetNumExecutorsPerResourceProfileId.map { case (rpId, targetNum) =>
501552 val starting = getOrUpdateNumExecutorsStartingForRPId(rpId).get
502553 val pending = pendingAllocatePerResourceProfileId.getOrElse(rpId, Seq .empty).size
554+ val relaunching = updateAndGetRelaunchingCount(rpId)
503555 val running = getOrUpdateRunningExecutorForRPId(rpId).size
504556 logDebug(s " Updating resource requests for ResourceProfile id: $rpId, target: " +
505- s " $targetNum, pending: $pending, running: $running, executorsStarting: $starting" )
506- (rpId, targetNum - pending - running - starting)
557+ s " $targetNum, pending: $pending, running: $running, executorsStarting: $starting, " +
558+ s " relaunchingExecutors: $relaunching" )
559+ (rpId, targetNum - pending - running - starting + relaunching)
507560 }.toMap
508561
509562 missingPerProfile.foreach { case (rpId, missing) =>
@@ -776,7 +829,8 @@ private[yarn] class YarnAllocator(
776829 val containerCores = rp.getExecutorCores.getOrElse(defaultCores)
777830
778831 val rpRunningExecs = getOrUpdateRunningExecutorForRPId(rpId).size
779- if (rpRunningExecs < getOrUpdateTargetNumExecutorsForRPId(rpId)) {
832+ val relaunchingCount = countRelaunching(getOrUpdateRunningExecutorForRPId(rpId))
833+ if (rpRunningExecs < getOrUpdateTargetNumExecutorsForRPId(rpId) + relaunchingCount) {
780834 getOrUpdateNumExecutorsStartingForRPId(rpId).incrementAndGet()
781835 launchingExecutorContainerIds.add(containerId)
782836 if (launchContainers) {
@@ -819,7 +873,8 @@ private[yarn] class YarnAllocator(
819873 } else {
820874 logInfo(log " Skip launching executorRunnable as running executors count: " +
821875 log " ${MDC (LogKeys .COUNT , rpRunningExecs)} reached target executors count: " +
822- log " ${MDC (LogKeys .NUM_EXECUTOR_TARGET , getOrUpdateTargetNumExecutorsForRPId(rpId))}. " )
876+ log " ${MDC (LogKeys .NUM_EXECUTOR_TARGET , getOrUpdateTargetNumExecutorsForRPId(rpId))} " +
877+ log " and relaunching count: ${MDC (LogKeys .COUNT , relaunchingCount)}. " )
823878 internalReleaseContainer(container)
824879 }
825880 }
@@ -829,7 +884,7 @@ private[yarn] class YarnAllocator(
829884 container : Container ): Unit = synchronized {
830885 val containerId = container.getId
831886 if (launchingExecutorContainerIds.contains(containerId)) {
832- getOrUpdateRunningExecutorForRPId(rpId).add (executorId)
887+ getOrUpdateRunningExecutorForRPId(rpId)(executorId) = Some ((clock.getTimeMillis(), false ) )
833888 executorIdToContainer(executorId) = container
834889 containerIdToExecutorIdAndResourceProfileId(containerId) = (executorId, rpId)
835890
0 commit comments