@@ -350,11 +350,20 @@ private[spark] class AppStatusListener(
350
350
val e = it.next()
351
351
if (job.stageIds.contains(e.getKey()._1)) {
352
352
val stage = e.getValue()
353
- stage.status = v1.StageStatus .SKIPPED
354
- job.skippedStages += stage.info.stageId
355
- job.skippedTasks += stage.info.numTasks
356
- it.remove()
357
- update(stage, now)
353
+ if (v1.StageStatus .PENDING .equals(stage.status)) {
354
+ stage.status = v1.StageStatus .SKIPPED
355
+ job.skippedStages += stage.info.stageId
356
+ job.skippedTasks += stage.info.numTasks
357
+ job.activeStages -= 1
358
+
359
+ pools.get(stage.schedulingPool).foreach { pool =>
360
+ pool.stageIds = pool.stageIds - stage.info.stageId
361
+ update(pool, now)
362
+ }
363
+
364
+ it.remove()
365
+ update(stage, now, last = true )
366
+ }
358
367
}
359
368
}
360
369
@@ -506,7 +515,16 @@ private[spark] class AppStatusListener(
506
515
if (killedDelta > 0 ) {
507
516
stage.killedSummary = killedTasksSummary(event.reason, stage.killedSummary)
508
517
}
509
- maybeUpdate(stage, now)
518
+ // [SPARK-24415] Wait for all tasks to finish before removing stage from live list
519
+ val removeStage =
520
+ stage.activeTasks == 0 &&
521
+ (v1.StageStatus .COMPLETE .equals(stage.status) ||
522
+ v1.StageStatus .FAILED .equals(stage.status))
523
+ if (removeStage) {
524
+ update(stage, now, last = true )
525
+ } else {
526
+ maybeUpdate(stage, now)
527
+ }
510
528
511
529
// Store both stage ID and task index in a single long variable for tracking at job level.
512
530
val taskIndex = (event.stageId.toLong << Integer .SIZE ) | event.taskInfo.index
@@ -521,7 +539,7 @@ private[spark] class AppStatusListener(
521
539
if (killedDelta > 0 ) {
522
540
job.killedSummary = killedTasksSummary(event.reason, job.killedSummary)
523
541
}
524
- maybeUpdate (job, now)
542
+ conditionalLiveUpdate (job, now, removeStage )
525
543
}
526
544
527
545
val esummary = stage.executorSummary(event.taskInfo.executorId)
@@ -532,14 +550,17 @@ private[spark] class AppStatusListener(
532
550
if (metricsDelta != null ) {
533
551
esummary.metrics = LiveEntityHelpers .addMetrics(esummary.metrics, metricsDelta)
534
552
}
535
- maybeUpdate (esummary, now)
553
+ conditionalLiveUpdate (esummary, now, removeStage )
536
554
537
555
if (! stage.cleaning && stage.savedTasks.get() > maxTasksPerStage) {
538
556
stage.cleaning = true
539
557
kvstore.doAsync {
540
558
cleanupTasks(stage)
541
559
}
542
560
}
561
+ if (removeStage) {
562
+ liveStages.remove((event.stageId, event.stageAttemptId))
563
+ }
543
564
}
544
565
545
566
liveExecutors.get(event.taskInfo.executorId).foreach { exec =>
@@ -564,17 +585,13 @@ private[spark] class AppStatusListener(
564
585
565
586
// Force an update on live applications when the number of active tasks reaches 0. This is
566
587
// checked in some tests (e.g. SQLTestUtilsBase) so it needs to be reliably up to date.
567
- if (exec.activeTasks == 0 ) {
568
- liveUpdate(exec, now)
569
- } else {
570
- maybeUpdate(exec, now)
571
- }
588
+ conditionalLiveUpdate(exec, now, exec.activeTasks == 0 )
572
589
}
573
590
}
574
591
575
592
override def onStageCompleted (event : SparkListenerStageCompleted ): Unit = {
576
593
val maybeStage =
577
- Option (liveStages.remove ((event.stageInfo.stageId, event.stageInfo.attemptNumber)))
594
+ Option (liveStages.get ((event.stageInfo.stageId, event.stageInfo.attemptNumber)))
578
595
maybeStage.foreach { stage =>
579
596
val now = System .nanoTime()
580
597
stage.info = event.stageInfo
@@ -608,14 +625,20 @@ private[spark] class AppStatusListener(
608
625
}
609
626
610
627
stage.executorSummaries.values.foreach(update(_, now))
611
- update(stage, now, last = true )
612
628
613
629
val executorIdsForStage = stage.blackListedExecutors
614
630
executorIdsForStage.foreach { executorId =>
615
631
liveExecutors.get(executorId).foreach { exec =>
616
632
removeBlackListedStageFrom(exec, event.stageInfo.stageId, now)
617
633
}
618
634
}
635
+
636
+ // Remove stage only if there are no active tasks remaining
637
+ val removeStage = stage.activeTasks == 0
638
+ update(stage, now, last = removeStage)
639
+ if (removeStage) {
640
+ liveStages.remove((event.stageInfo.stageId, event.stageInfo.attemptNumber))
641
+ }
619
642
}
620
643
621
644
appSummary = new AppSummary (appSummary.numCompletedJobs, appSummary.numCompletedStages + 1 )
@@ -882,6 +905,14 @@ private[spark] class AppStatusListener(
882
905
}
883
906
}
884
907
908
/**
 * Dispatches an entity update to one of two update paths, selected by `condition`.
 *
 * @param entity    the live entity to pass to the chosen update method
 * @param now       timestamp forwarded unchanged to the chosen update method
 * @param condition when true the entity is handed to `liveUpdate`; when false it is
 *                  handed to `maybeUpdate`
 */
private def conditionalLiveUpdate(entity: LiveEntity, now: Long, condition: Boolean): Unit = {
  // NOTE(review): presumably `liveUpdate` forces a write for live applications while
  // `maybeUpdate` may defer it -- confirm against their definitions.
  condition match {
    case true => liveUpdate(entity, now)
    case false => maybeUpdate(entity, now)
  }
}
915
+
885
916
private def cleanupExecutors (count : Long ): Unit = {
886
917
// Because the limit is on the number of *dead* executors, we need to calculate whether
887
918
// there are actually enough dead executors to be deleted.
0 commit comments