@@ -81,7 +81,8 @@ private[yarn] class YarnAllocator(
81
81
private val releasedContainers = Collections .newSetFromMap[ContainerId ](
82
82
new ConcurrentHashMap [ContainerId , java.lang.Boolean ])
83
83
84
- private val numExecutorsRunning = new AtomicInteger (0 )
84
+ private val runningExecutors = Collections .newSetFromMap[String ](
85
+ new ConcurrentHashMap [String , java.lang.Boolean ]())
85
86
86
87
private val numExecutorsStarting = new AtomicInteger (0 )
87
88
@@ -166,7 +167,7 @@ private[yarn] class YarnAllocator(
166
167
clock = newClock
167
168
}
168
169
169
- def getNumExecutorsRunning : Int = numExecutorsRunning.get ()
170
+ def getNumExecutorsRunning : Int = runningExecutors.size ()
170
171
171
172
def getNumExecutorsFailed : Int = synchronized {
172
173
val endTime = clock.getTimeMillis()
@@ -242,12 +243,11 @@ private[yarn] class YarnAllocator(
242
243
* Request that the ResourceManager release the container running the specified executor.
243
244
*/
244
245
def killExecutor (executorId : String ): Unit = synchronized {
245
- if (executorIdToContainer.contains(executorId)) {
246
- val container = executorIdToContainer.get(executorId).get
247
- internalReleaseContainer(container)
248
- numExecutorsRunning.decrementAndGet()
249
- } else {
250
- logWarning(s " Attempted to kill unknown executor $executorId! " )
246
+ executorIdToContainer.get(executorId) match {
247
+ case Some (container) if ! releasedContainers.contains(container.getId) =>
248
+ internalReleaseContainer(container)
249
+ runningExecutors.remove(executorId)
250
+ case _ => logWarning(s " Attempted to kill unknown executor $executorId! " )
251
251
}
252
252
}
253
253
@@ -274,7 +274,7 @@ private[yarn] class YarnAllocator(
274
274
" Launching executor count: %d. Cluster resources: %s." )
275
275
.format(
276
276
allocatedContainers.size,
277
- numExecutorsRunning.get ,
277
+ runningExecutors.size ,
278
278
numExecutorsStarting.get,
279
279
allocateResponse.getAvailableResources))
280
280
@@ -286,7 +286,7 @@ private[yarn] class YarnAllocator(
286
286
logDebug(" Completed %d containers" .format(completedContainers.size))
287
287
processCompletedContainers(completedContainers.asScala)
288
288
logDebug(" Finished processing %d completed containers. Current running executor count: %d."
289
- .format(completedContainers.size, numExecutorsRunning.get ))
289
+ .format(completedContainers.size, runningExecutors.size ))
290
290
}
291
291
}
292
292
@@ -300,9 +300,9 @@ private[yarn] class YarnAllocator(
300
300
val pendingAllocate = getPendingAllocate
301
301
val numPendingAllocate = pendingAllocate.size
302
302
val missing = targetNumExecutors - numPendingAllocate -
303
- numExecutorsStarting.get - numExecutorsRunning.get
303
+ numExecutorsStarting.get - runningExecutors.size
304
304
logDebug(s " Updating resource requests, target: $targetNumExecutors, " +
305
- s " pending: $numPendingAllocate, running: ${numExecutorsRunning.get }, " +
305
+ s " pending: $numPendingAllocate, running: ${runningExecutors.size }, " +
306
306
s " executorsStarting: ${numExecutorsStarting.get}" )
307
307
308
308
if (missing > 0 ) {
@@ -502,7 +502,7 @@ private[yarn] class YarnAllocator(
502
502
s " for executor with ID $executorId" )
503
503
504
504
def updateInternalState (): Unit = synchronized {
505
- numExecutorsRunning.incrementAndGet( )
505
+ runningExecutors.add(executorId )
506
506
numExecutorsStarting.decrementAndGet()
507
507
executorIdToContainer(executorId) = container
508
508
containerIdToExecutorId(container.getId) = executorId
@@ -513,7 +513,7 @@ private[yarn] class YarnAllocator(
513
513
allocatedContainerToHostMap.put(containerId, executorHostname)
514
514
}
515
515
516
- if (numExecutorsRunning.get < targetNumExecutors) {
516
+ if (runningExecutors.size() < targetNumExecutors) {
517
517
numExecutorsStarting.incrementAndGet()
518
518
if (launchContainers) {
519
519
launcherPool.execute(new Runnable {
@@ -554,7 +554,7 @@ private[yarn] class YarnAllocator(
554
554
} else {
555
555
logInfo((" Skip launching executorRunnable as running executors count: %d " +
556
556
" reached target executors count: %d." ).format(
557
- numExecutorsRunning.get , targetNumExecutors))
557
+ runningExecutors.size , targetNumExecutors))
558
558
}
559
559
}
560
560
}
@@ -569,7 +569,11 @@ private[yarn] class YarnAllocator(
569
569
val exitReason = if (! alreadyReleased) {
570
570
// Decrement the number of executors running. The next iteration of
571
571
// the ApplicationMaster's reporting thread will take care of allocating.
572
- numExecutorsRunning.decrementAndGet()
572
+ containerIdToExecutorId.get(containerId) match {
573
+ case Some (executorId) => runningExecutors.remove(executorId)
574
+ case None => logWarning(s " Cannot find executorId for container: ${containerId.toString}" )
575
+ }
576
+
573
577
logInfo(" Completed container %s%s (state: %s, exit status: %s)" .format(
574
578
containerId,
575
579
onHostStr,
0 commit comments