
Commit d2a8723

lwwmanning authored and Robert Kruszewski committed
Support dynamic allocation without external shuffle service (apache-spark-on-k8s#427)
Allows dynamically scaling executors up and down without the external shuffle service. Tracks shuffle output locations to determine whether executors can be safely scaled down.
1 parent: 84d8b05 · commit: d2a8723
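
For context, a minimal configuration sketch showing how a job might opt into this behavior. The inactive-shuffle timeout key is taken from the diff below; the other dynamic-allocation and shuffle-service keys are standard Spark settings, and the concrete values are illustrative only, not recommendations from this commit.

import org.apache.spark.SparkConf
import org.apache.spark.sql.SparkSession

// Illustrative values only: with this change, dynamic allocation no longer
// requires spark.shuffle.service.enabled=true. Executors whose shuffle files
// all belong to inactive shuffles become eligible for removal after the
// inactive-shuffle idle timeout introduced in this commit.
val conf = new SparkConf()
  .set("spark.dynamicAllocation.enabled", "true")
  .set("spark.shuffle.service.enabled", "false")
  .set("spark.dynamicAllocation.minExecutors", "1")
  .set("spark.dynamicAllocation.maxExecutors", "20")
  .set("spark.dynamicAllocation.executorIdleTimeout", "60s")
  .set("spark.dynamicAllocation.inactiveShuffleExecutorIdleTimeout", "120s")

val spark = SparkSession.builder().config(conf).getOrCreate()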

File tree

11 files changed: +478 −98 lines

core/src/main/scala/org/apache/spark/ExecutorAllocationManager.scala

Lines changed: 38 additions & 20 deletions
@@ -25,7 +25,7 @@ import scala.util.control.{ControlThrowable, NonFatal}
 
 import com.codahale.metrics.{Gauge, MetricRegistry}
 
-import org.apache.spark.internal.{config, Logging}
+import org.apache.spark.internal.Logging
 import org.apache.spark.internal.config._
 import org.apache.spark.metrics.source.Source
 import org.apache.spark.scheduler._
@@ -87,6 +87,7 @@ private[spark] class ExecutorAllocationManager(
     client: ExecutorAllocationClient,
     listenerBus: LiveListenerBus,
     conf: SparkConf,
+    mapOutputTracker: MapOutputTrackerMaster,
     blockManagerMaster: BlockManagerMaster)
   extends Logging {
 
@@ -114,6 +115,9 @@ private[spark] class ExecutorAllocationManager(
   private val cachedExecutorIdleTimeoutS = conf.getTimeAsSeconds(
     "spark.dynamicAllocation.cachedExecutorIdleTimeout", s"${Integer.MAX_VALUE}s")
 
+  private val inactiveShuffleExecutorIdleTimeoutS = conf.getTimeAsSeconds(
+    "spark.dynamicAllocation.inactiveShuffleExecutorIdleTimeout", s"${Integer.MAX_VALUE}s")
+
   // During testing, the methods to actually kill and add executors are mocked out
   private val testing = conf.getBoolean("spark.dynamicAllocation.testing", false)
 
@@ -210,12 +214,6 @@ private[spark] class ExecutorAllocationManager(
     if (cachedExecutorIdleTimeoutS < 0) {
       throw new SparkException("spark.dynamicAllocation.cachedExecutorIdleTimeout must be >= 0!")
     }
-    // Require external shuffle service for dynamic allocation
-    // Otherwise, we may lose shuffle files when killing executors
-    if (!conf.get(config.SHUFFLE_SERVICE_ENABLED) && !testing) {
-      throw new SparkException("Dynamic allocation of executors requires the external " +
-        "shuffle service. You may enable this through spark.shuffle.service.enabled.")
-    }
     if (tasksPerExecutorForFullParallelism == 0) {
       throw new SparkException("spark.executor.cores must not be < spark.task.cpus.")
     }
@@ -546,7 +544,7 @@ private[spark] class ExecutorAllocationManager(
       // has been reached, it will no longer be marked as idle. When new executors join,
       // however, we are no longer at the lower bound, and so we must mark executor X
       // as idle again so as not to forget that it is a candidate for removal. (see SPARK-4951)
-      executorIds.filter(listener.isExecutorIdle).foreach(onExecutorIdle)
+      checkForIdleExecutors()
       logInfo(s"New executor $executorId has registered (new total is ${executorIds.size})")
     } else {
       logWarning(s"Duplicate executor $executorId has registered")
@@ -601,30 +599,44 @@ private[spark] class ExecutorAllocationManager(
    */
   private def onExecutorIdle(executorId: String): Unit = synchronized {
     if (executorIds.contains(executorId)) {
-      if (!removeTimes.contains(executorId) && !executorsPendingToRemove.contains(executorId)) {
+      val hasActiveShuffleBlocks =
+        mapOutputTracker.hasOutputsOnExecutor(executorId, activeOnly = true)
+      if (!removeTimes.contains(executorId)
+        && !executorsPendingToRemove.contains(executorId)
+        && !hasActiveShuffleBlocks) {
         // Note that it is not necessary to query the executors since all the cached
         // blocks we are concerned with are reported to the driver. Note that this
         // does not include broadcast blocks.
         val hasCachedBlocks = blockManagerMaster.hasCachedBlocks(executorId)
+        val hasAnyShuffleBlocks = mapOutputTracker.hasOutputsOnExecutor(executorId)
         val now = clock.getTimeMillis()
-        val timeout = {
-          if (hasCachedBlocks) {
-            // Use a different timeout if the executor has cached blocks.
-            now + cachedExecutorIdleTimeoutS * 1000
-          } else {
-            now + executorIdleTimeoutS * 1000
-          }
-        }
-        val realTimeout = if (timeout <= 0) Long.MaxValue else timeout // overflow
-        removeTimes(executorId) = realTimeout
+
+        // Use the maximum of all the timeouts that apply.
+        val timeoutS = List(
+          executorIdleTimeoutS,
+          if (hasCachedBlocks) cachedExecutorIdleTimeoutS else 0,
+          if (hasAnyShuffleBlocks) inactiveShuffleExecutorIdleTimeoutS else 0)
+          .max
+
+        val expiryTime = now + timeoutS * 1000;
+        val realExpiryTime = if (expiryTime <= 0) Long.MaxValue else expiryTime
+
+        removeTimes(executorId) = realExpiryTime
         logDebug(s"Starting idle timer for $executorId because there are no more tasks " +
-          s"scheduled to run on the executor (to expire in ${(realTimeout - now)/1000} seconds)")
+          s"scheduled to run on the executor (to expire in ${(realExpiryTime - now)/1000} seconds)")
       }
     } else {
       logWarning(s"Attempted to mark unknown executor $executorId idle")
     }
   }
 
+  /**
+   * Check if any executors are now idle, and call the idle callback for them.
+   */
+  private def checkForIdleExecutors(): Unit = synchronized {
+    executorIds.filter(listener.isExecutorIdle).foreach(onExecutorIdle)
+  }
+
   /**
    * Callback invoked when the specified executor is now running a task.
    * This resets all variables used for removing this executor.
@@ -659,6 +671,12 @@ private[spark] class ExecutorAllocationManager(
     // place the executors.
     private val stageIdToExecutorPlacementHints = new mutable.HashMap[Int, (Int, Map[String, Int])]
 
+    override def onJobEnd(jobEnd: SparkListenerJobEnd): Unit = {
+      // At the end of a job, trigger the callbacks for idle executors again to clean up executors
+      // which we were keeping around only because they held active shuffle blocks.
+      allocationManager.checkForIdleExecutors()
+    }
+
     override def onStageSubmitted(stageSubmitted: SparkListenerStageSubmitted): Unit = {
       initializing = false
       val stageId = stageSubmitted.stageInfo.stageId
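
A standalone sketch of the timeout-selection rule that the onExecutorIdle hunk above introduces; the function name, parameter list, and example values are illustrative, not the class's actual fields.

// Minimal sketch of the "max of all applicable timeouts" rule from the hunk above.
def idleExpiryMillis(
    now: Long,
    executorIdleTimeoutS: Long,
    cachedExecutorIdleTimeoutS: Long,
    inactiveShuffleExecutorIdleTimeoutS: Long,
    hasCachedBlocks: Boolean,
    hasAnyShuffleBlocks: Boolean): Long = {
  val timeoutS = List(
    executorIdleTimeoutS,
    if (hasCachedBlocks) cachedExecutorIdleTimeoutS else 0L,
    if (hasAnyShuffleBlocks) inactiveShuffleExecutorIdleTimeoutS else 0L).max
  val expiry = now + timeoutS * 1000
  if (expiry <= 0) Long.MaxValue else expiry // guard against overflow
}

// e.g. idleExpiryMillis(0L, 60, 600, 120, hasCachedBlocks = false, hasAnyShuffleBlocks = true)
// returns 120000: the inactive-shuffle timeout wins over the plain idle timeout.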

core/src/main/scala/org/apache/spark/MapOutputTracker.scala

Lines changed: 88 additions & 4 deletions
@@ -28,6 +28,7 @@ import scala.concurrent.duration.Duration
 import scala.reflect.ClassTag
 import scala.util.control.NonFatal
 
+import org.apache.spark.ExecutorShuffleStatus.ExecutorShuffleStatus
 import org.apache.spark.broadcast.{Broadcast, BroadcastManager}
 import org.apache.spark.internal.Logging
 import org.apache.spark.internal.config._
@@ -61,6 +62,13 @@ private class ShuffleStatus(numPartitions: Int) {
   // Exposed for testing
   val mapStatuses = new Array[MapStatus](numPartitions)
 
+  /**
+   * Whether an active job in the [[org.apache.spark.scheduler.DAGScheduler]] depends on this.
+   * If dynamic allocation is enabled, then executors that do not contain active shuffles may
+   * eventually be surrendered by the [[ExecutorAllocationManager]].
+   */
+  var isActive = true
+
   /**
    * The cached result of serializing the map statuses array. This cache is lazily populated when
    * [[serializedMapStatus]] is called. The cache is invalidated when map outputs are removed.
@@ -80,17 +88,24 @@ private class ShuffleStatus(numPartitions: Int) {
   /**
    * Counter tracking the number of partitions that have output. This is a performance optimization
    * to avoid having to count the number of non-null entries in the `mapStatuses` array and should
-   * be equivalent to`mapStatuses.count(_ ne null)`.
+   * be equivalent to `mapStatuses.count(_ ne null)`.
    */
   private[this] var _numAvailableOutputs: Int = 0
 
+  /**
+   * Cached set of executorIds on which outputs exist. This is a performance optimization to avoid
+   * having to repeatedly iterate over ever element in the `mapStatuses` array and should be
+   * equivalent to `mapStatuses.map(_.location.executorId).groupBy(x => x).mapValues(_.length)`.
+   */
+  private[this] val _numOutputsPerExecutorId = HashMap[String, Int]().withDefaultValue(0)
+
   /**
    * Register a map output. If there is already a registered location for the map output then it
    * will be replaced by the new location.
   */
   def addMapOutput(mapId: Int, status: MapStatus): Unit = synchronized {
     if (mapStatuses(mapId) == null) {
-      _numAvailableOutputs += 1
+      incrementNumAvailableOutputs(status.location)
       invalidateSerializedMapOutputStatusCache()
     }
     mapStatuses(mapId) = status
@@ -103,7 +118,7 @@ private class ShuffleStatus(numPartitions: Int) {
    */
   def removeMapOutput(mapId: Int, bmAddress: BlockManagerId): Unit = synchronized {
     if (mapStatuses(mapId) != null && mapStatuses(mapId).location == bmAddress) {
-      _numAvailableOutputs -= 1
+      decrementNumAvailableOutputs(bmAddress)
       mapStatuses(mapId) = null
       invalidateSerializedMapOutputStatusCache()
     }
@@ -133,13 +148,21 @@ private class ShuffleStatus(numPartitions: Int) {
   def removeOutputsByFilter(f: (BlockManagerId) => Boolean): Unit = synchronized {
     for (mapId <- 0 until mapStatuses.length) {
       if (mapStatuses(mapId) != null && f(mapStatuses(mapId).location)) {
-        _numAvailableOutputs -= 1
+        decrementNumAvailableOutputs(mapStatuses(mapId).location)
         mapStatuses(mapId) = null
         invalidateSerializedMapOutputStatusCache()
       }
     }
   }
 
+  def hasOutputsOnExecutor(execId: String): Boolean = synchronized {
+    _numOutputsPerExecutorId(execId) > 0
+  }
+
+  def executorsWithOutputs(): Set[String] = synchronized {
+    _numOutputsPerExecutorId.keySet.toSet
+  }
+
   /**
    * Number of partitions that have shuffle outputs.
   */
@@ -192,6 +215,22 @@ private class ShuffleStatus(numPartitions: Int) {
     f(mapStatuses)
   }
 
+  private[this] def incrementNumAvailableOutputs(bmAddress: BlockManagerId): Unit = synchronized {
+    _numOutputsPerExecutorId(bmAddress.executorId) += 1
+    _numAvailableOutputs += 1
+  }
+
+  private[this] def decrementNumAvailableOutputs(bmAddress: BlockManagerId): Unit = synchronized {
+    assert(_numOutputsPerExecutorId(bmAddress.executorId) >= 1,
+      s"Tried to remove non-existent output from ${bmAddress.executorId}")
+    if (_numOutputsPerExecutorId(bmAddress.executorId) == 1) {
+      _numOutputsPerExecutorId.remove(bmAddress.executorId)
+    } else {
+      _numOutputsPerExecutorId(bmAddress.executorId) -= 1
+    }
+    _numAvailableOutputs -= 1
+  }
+
   /**
    * Clears the cached serialized map output statuses.
   */
@@ -306,6 +345,11 @@ private[spark] abstract class MapOutputTracker(conf: SparkConf) extends Logging
   def stop() {}
 }
 
+private[spark] object ExecutorShuffleStatus extends Enumeration {
+  type ExecutorShuffleStatus = Value
+  val Active, Inactive, Unknown = Value
+}
+
 /**
  * Driver-side class that keeps track of the location of the map output of a stage.
  *
@@ -453,6 +497,26 @@ private[spark] class MapOutputTrackerMaster(
     }
   }
 
+  def markShuffleInactive(shuffleId: Int): Unit = {
+    shuffleStatuses.get(shuffleId) match {
+      case Some(shuffleStatus) =>
+        shuffleStatus.isActive = false
+      case None =>
+        throw new SparkException(
+          s"markShuffleInactive called for nonexistent shuffle ID $shuffleId.")
+    }
+  }
+
+  def markShuffleActive(shuffleId: Int): Unit = {
+    shuffleStatuses.get(shuffleId) match {
+      case Some(shuffleStatus) =>
+        shuffleStatus.isActive = true
+      case None =>
+        throw new SparkException(
+          s"markShuffleActive called for nonexistent shuffle ID $shuffleId.")
+    }
+  }
+
   /**
    * Removes all shuffle outputs associated with this host. Note that this will also remove
    * outputs which are served by an external shuffle server (if one exists).
@@ -472,6 +536,12 @@ private[spark] class MapOutputTrackerMaster(
     incrementEpoch()
   }
 
+  def hasOutputsOnExecutor(execId: String, activeOnly: Boolean = false): Boolean = {
+    shuffleStatuses.valuesIterator.exists { status =>
+      status.hasOutputsOnExecutor(execId) && (!activeOnly || status.isActive)
+    }
+  }
+
   /** Check if the given shuffle is being tracked */
   def containsShuffle(shuffleId: Int): Boolean = shuffleStatuses.contains(shuffleId)
 
@@ -577,6 +647,20 @@ private[spark] class MapOutputTrackerMaster(
     }
   }
 
+  /**
+   * Return the set of executors that contain tracked shuffle files, with a status of
+   * [[ExecutorShuffleStatus.Inactive]] iff all shuffle files on that executor are marked inactive.
+   *
+   * @return a map of executor IDs to their corresponding [[ExecutorShuffleStatus]]
+   */
+  def getExecutorShuffleStatus: scala.collection.Map[String, ExecutorShuffleStatus] = {
+    shuffleStatuses.values
+      .flatMap(status => status.executorsWithOutputs().map(_ -> status.isActive))
+      .groupBy(_._1)
+      .mapValues(_.exists(_._2))
+      .mapValues(if (_) ExecutorShuffleStatus.Active else ExecutorShuffleStatus.Inactive)
+  }
+
   /**
    * Return a list of locations that each have fraction of map output greater than the specified
    * threshold.
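
A hypothetical caller-side sketch (not part of this commit) of how the new MapOutputTrackerMaster queries could be used to pick executors that are safe to scale down; the safeToRemove helper and candidateExecutors name are assumptions for illustration.

import org.apache.spark.{ExecutorShuffleStatus, MapOutputTrackerMaster}

// Hypothetical helper, for illustration only: an executor is considered safe to
// remove when it holds no shuffle output at all, or only output from shuffles
// that the DAGScheduler has marked inactive (no running job depends on them).
def safeToRemove(tracker: MapOutputTrackerMaster, candidateExecutors: Seq[String]): Seq[String] = {
  val statuses = tracker.getExecutorShuffleStatus
  candidateExecutors.filter { execId =>
    statuses.get(execId) match {
      case None => true                                  // no tracked shuffle files
      case Some(ExecutorShuffleStatus.Inactive) => true  // only inactive shuffle files
      case _ => false                                    // active (or unknown) shuffle files
    }
  }
}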

core/src/main/scala/org/apache/spark/SparkContext.scala

Lines changed: 1 addition & 0 deletions
@@ -546,6 +546,7 @@ class SparkContext(config: SparkConf) extends Logging {
       case b: ExecutorAllocationClient =>
         Some(new ExecutorAllocationManager(
           schedulerBackend.asInstanceOf[ExecutorAllocationClient], listenerBus, _conf,
+          _env.mapOutputTracker.asInstanceOf[MapOutputTrackerMaster],
           _env.blockManager.master))
       case _ =>
         None

core/src/main/scala/org/apache/spark/scheduler/DAGScheduler.scala

Lines changed: 30 additions & 11 deletions
@@ -397,7 +397,9 @@ private[spark] class DAGScheduler(
     shuffleIdToMapStage(shuffleDep.shuffleId) = stage
     updateJobIdStageIdMaps(jobId, stage)
 
-    if (!mapOutputTracker.containsShuffle(shuffleDep.shuffleId)) {
+    if (mapOutputTracker.containsShuffle(shuffleDep.shuffleId)) {
+      mapOutputTracker.markShuffleActive(shuffleDep.shuffleId)
+    } else {
       // Kind of ugly: need to register RDDs with the cache and map output tracker here
       // since we can't do it in the RDD constructor because # of partitions is unknown
       logInfo("Registering RDD " + rdd.id + " (" + rdd.getCreationSite + ")")
@@ -627,6 +629,7 @@ private[spark] class DAGScheduler(
     }
     for ((k, v) <- shuffleIdToMapStage.find(_._2 == stage)) {
       shuffleIdToMapStage.remove(k)
+      mapOutputTracker.markShuffleInactive(k)
     }
     if (waitingStages.contains(stage)) {
       logDebug("Removing stage %d from waiting set.".format(stageId))
@@ -1367,6 +1370,31 @@ private[spark] class DAGScheduler(
       case _: ExceptionFailure | _: TaskKilled => updateAccumulators(event)
       case _ =>
     }
+
+    // Make sure shuffle outputs are registered before we post the event so that
+    // handlers can act on up-to-date shuffle information.
+    event.reason match {
+      case Success =>
+        task match {
+          case smt: ShuffleMapTask =>
+            val shuffleStage = stage.asInstanceOf[ShuffleMapStage]
+            val status = event.result.asInstanceOf[MapStatus]
+            val execId = status.location.executorId
+            logDebug("Registering shuffle output on executor " + execId)
+            if (failedEpoch.contains(execId) && smt.epoch <= failedEpoch(execId)) {
+              logInfo(s"Ignoring possibly bogus $smt completion from executor $execId")
+            } else {
+              // The epoch of the task is acceptable (i.e., the task was launched after the most
+              // recent failure we're aware of for the executor), so mark the task's output as
+              // available.
+              mapOutputTracker.registerMapOutput(
+                shuffleStage.shuffleDep.shuffleId, smt.partitionId, status)
+            }
+          case _ =>
+        }
+      case _ =>
+    }
+
     postTaskEnd(event)
 
     event.reason match {
@@ -1418,21 +1446,12 @@ private[spark] class DAGScheduler(
           logInfo("Ignoring result from " + rt + " because its job has finished")
         }
 
-      case smt: ShuffleMapTask =>
+      case _: ShuffleMapTask =>
         val shuffleStage = stage.asInstanceOf[ShuffleMapStage]
         shuffleStage.pendingPartitions -= task.partitionId
         val status = event.result.asInstanceOf[MapStatus]
         val execId = status.location.executorId
         logDebug("ShuffleMapTask finished on " + execId)
-        if (failedEpoch.contains(execId) && smt.epoch <= failedEpoch(execId)) {
-          logInfo(s"Ignoring possibly bogus $smt completion from executor $execId")
-        } else {
-          // The epoch of the task is acceptable (i.e., the task was launched after the most
-          // recent failure we're aware of for the executor), so mark the task's output as
-          // available.
-          mapOutputTracker.registerMapOutput(
-            shuffleStage.shuffleDep.shuffleId, smt.partitionId, status)
-        }
 
         if (runningStages.contains(shuffleStage) && shuffleStage.pendingPartitions.isEmpty) {
           markStageAsFinished(shuffleStage)
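
To summarize the lifecycle wired up in the DAGScheduler hunks above: a shuffle is (re)marked active when a stage that depends on it is registered, and marked inactive when its map stage is cleaned up; executors whose tracked shuffles are all inactive then become removal candidates. Below is a small, self-contained toy model of that bookkeeping; all names are illustrative and the real logic lives in MapOutputTrackerMaster and DAGScheduler as shown in the diffs.

import scala.collection.mutable

// Toy model of the active/inactive shuffle bookkeeping introduced by this commit.
object ShuffleLifecycleModel {
  private val shuffleActive = mutable.Map[Int, Boolean]()         // shuffleId -> isActive
  private val shuffleExecutors = mutable.Map[Int, Set[String]]()  // shuffleId -> executors with output

  // Called when a stage depending on this shuffle is created or reused.
  def registerShuffle(shuffleId: Int, executors: Set[String]): Unit = {
    shuffleActive(shuffleId) = true
    shuffleExecutors(shuffleId) = executors
  }

  // Called when the map stage producing this shuffle is cleaned up.
  def onStageCleanup(shuffleId: Int): Unit = shuffleActive(shuffleId) = false

  // An executor is a removal candidate if every shuffle it holds output for is inactive.
  def removalCandidates(): Set[String] = {
    val all = shuffleExecutors.values.flatten.toSet
    val activeExecs = shuffleExecutors.collect {
      case (id, execs) if shuffleActive.getOrElse(id, false) => execs
    }.flatten.toSet
    all -- activeExecs
  }
}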
