Skip to content

Commit b9bf045

Browse files
rynorris authored and Robert Kruszewski committed
Shuffle biased task scheduling (apache-spark-on-k8s#447)
1 parent 4b2ae5a commit b9bf045

File tree

4 files changed

+96
-19
lines changed

4 files changed

+96
-19
lines changed

core/src/main/scala/org/apache/spark/MapOutputTracker.scala

Lines changed: 2 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -656,8 +656,8 @@ private[spark] class MapOutputTrackerMaster(
656656
def getExecutorShuffleStatus: scala.collection.Map[String, ExecutorShuffleStatus] = {
657657
shuffleStatuses.values
658658
.flatMap(status => status.executorsWithOutputs().map(_ -> status.isActive))
659-
.groupBy(_._1)
660-
.mapValues(_.exists(_._2))
659+
.groupBy(_._1) // group by executor ID
660+
.mapValues(_.exists(_._2)) // true if any are Active
661661
.mapValues(if (_) ExecutorShuffleStatus.Active else ExecutorShuffleStatus.Inactive)
662662
}
663663

core/src/main/scala/org/apache/spark/scheduler/TaskSchedulerImpl.scala

Lines changed: 56 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -81,6 +81,10 @@ private[spark] class TaskSchedulerImpl(
8181
private val speculationScheduler =
8282
ThreadUtils.newDaemonSingleThreadScheduledExecutor("task-scheduler-speculation")
8383

84+
// whether to prefer assigning tasks to executors that contain shuffle files
85+
val shuffleBiasedTaskSchedulingEnabled =
86+
conf.getBoolean("spark.scheduler.shuffleBiasedTaskScheduling.enabled", false)
87+
8488
// Threshold above which we warn user initial TaskSet may be starved
8589
val STARVATION_TIMEOUT_MS = conf.getTimeAsMs("spark.starvation.timeout", "15s")
8690

@@ -377,11 +381,7 @@ private[spark] class TaskSchedulerImpl(
377381
}
378382
}.getOrElse(offers)
379383

380-
val shuffledOffers = shuffleOffers(filteredOffers)
381-
// Build a list of tasks to assign to each worker.
382-
val tasks = shuffledOffers.map(o => new ArrayBuffer[TaskDescription](o.cores / CPUS_PER_TASK))
383-
val availableCpus = shuffledOffers.map(o => o.cores).toArray
384-
val availableSlots = shuffledOffers.map(o => o.cores / CPUS_PER_TASK).sum
384+
var tasks: Seq[Seq[TaskDescription]] = Nil
385385
val sortedTaskSets = rootPool.getSortedTaskSetQueue
386386
for (taskSet <- sortedTaskSets) {
387387
logDebug("parentName: %s, name: %s, runningTasks: %s".format(
@@ -391,11 +391,36 @@ private[spark] class TaskSchedulerImpl(
391391
}
392392
}
393393

394+
// If shuffle-biased task scheduling is enabled, then first assign as many tasks as possible to
395+
// executors containing active shuffle files, followed by assigning to executors with inactive
396+
// shuffle files, and then finally to those without shuffle files. This bin packing allows for
397+
// more efficient dynamic allocation in the absence of an external shuffle service.
398+
val partitionedAndShuffledOffers = partitionAndShuffleOffers(filteredOffers)
399+
for (shuffledOffers <- partitionedAndShuffledOffers.map(_._2)) {
400+
tasks ++= doResourceOffers(shuffledOffers, sortedTaskSets)
401+
}
402+
403+
// TODO SPARK-24823 Cancel a job that contains barrier stage(s) if the barrier tasks don't get
404+
// launched within a configured time.
405+
if (tasks.size > 0) {
406+
hasLaunchedTask = true
407+
}
408+
return tasks
409+
}
410+
411+
private def doResourceOffers(
412+
shuffledOffers: IndexedSeq[WorkerOffer],
413+
sortedTaskSets: IndexedSeq[TaskSetManager]): Seq[Seq[TaskDescription]] = {
414+
// Build a list of tasks to assign to each worker.
415+
val tasks = shuffledOffers.map(o => new ArrayBuffer[TaskDescription](o.cores / CPUS_PER_TASK))
416+
val availableCpus = shuffledOffers.map(o => o.cores).toArray
417+
val availableSlots = shuffledOffers.map(o => o.cores / CPUS_PER_TASK).sum
418+
394419
// Take each TaskSet in our scheduling order, and then offer it each node in increasing order
395420
// of locality levels so that it gets a chance to launch local tasks on all of them.
396421
// NOTE: the preferredLocality order: PROCESS_LOCAL, NODE_LOCAL, NO_PREF, RACK_LOCAL, ANY
397422
for (taskSet <- sortedTaskSets) {
398-
// Skip the barrier taskSet if the available slots are less than the number of pending tasks.
423+
// Skip the barrier taskSet if the available slots are less than the number of pending tasks
399424
if (taskSet.isBarrier && availableSlots < taskSet.numTasks) {
400425
// Skip the launch process.
401426
// TODO SPARK-24819 If the job requires more slots than available (both busy and free
@@ -439,25 +464,40 @@ private[spark] class TaskSchedulerImpl(
439464
.mkString(",")
440465
addressesWithDescs.foreach(_._2.properties.setProperty("addresses", addressesStr))
441466

442-
logInfo(s"Successfully scheduled all the ${addressesWithDescs.size} tasks for barrier " +
443-
s"stage ${taskSet.stageId}.")
467+
logInfo(s"Successfully scheduled all the ${addressesWithDescs.size} tasks for " +
468+
s"barrier stage ${taskSet.stageId}.")
444469
}
445470
}
446471
}
472+
tasks
473+
}
447474

448-
// TODO SPARK-24823 Cancel a job that contains barrier stage(s) if the barrier tasks don't get
449-
// launched within a configured time.
450-
if (tasks.size > 0) {
451-
hasLaunchedTask = true
475+
/**
476+
* Shuffle offers around to avoid always placing tasks on the same workers.
477+
* If shuffle-biased task scheduling is enabled, this function partitions the offers based on
478+
* whether they have active/inactive/no shuffle files present.
479+
*/
480+
def partitionAndShuffleOffers(offers: IndexedSeq[WorkerOffer])
481+
: IndexedSeq[(ExecutorShuffleStatus.Value, IndexedSeq[WorkerOffer])] = {
482+
if (shuffleBiasedTaskSchedulingEnabled && offers.length > 1) {
483+
// bias towards executors that have active shuffle outputs
484+
val execShuffles = mapOutputTracker.getExecutorShuffleStatus
485+
offers
486+
.groupBy(offer => execShuffles.getOrElse(offer.executorId, ExecutorShuffleStatus.Unknown))
487+
.mapValues(doShuffleOffers)
488+
.toStream
489+
.sortBy(_._1) // order: Active, Inactive, Unknown
490+
.toIndexedSeq
491+
} else {
492+
IndexedSeq((ExecutorShuffleStatus.Unknown, doShuffleOffers(offers)))
452493
}
453-
return tasks
454494
}
455495

456496
/**
457-
* Shuffle offers around to avoid always placing tasks on the same workers. Exposed to allow
458-
* overriding in tests, so it can be deterministic.
497+
* Does the shuffling for [[partitionAndShuffleOffers()]]. Exposed to allow overriding in tests,
498+
* so that it can be deterministic.
459499
*/
460-
protected def shuffleOffers(offers: IndexedSeq[WorkerOffer]): IndexedSeq[WorkerOffer] = {
500+
protected def doShuffleOffers(offers: IndexedSeq[WorkerOffer]): IndexedSeq[WorkerOffer] = {
461501
Random.shuffle(offers)
462502
}
463503

core/src/test/scala/org/apache/spark/scheduler/TaskSchedulerImplSuite.scala

Lines changed: 37 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -27,8 +27,10 @@ import org.scalatest.BeforeAndAfterEach
2727
import org.scalatest.mockito.MockitoSugar
2828

2929
import org.apache.spark._
30+
import org.apache.spark.ExecutorShuffleStatus._
3031
import org.apache.spark.internal.Logging
3132
import org.apache.spark.internal.config
33+
import org.apache.spark.storage.BlockManagerId
3234
import org.apache.spark.util.ManualClock
3335

3436
class FakeSchedulerBackend extends SchedulerBackend {
@@ -836,7 +838,7 @@ class TaskSchedulerImplSuite extends SparkFunSuite with LocalSparkContext with B
836838
// We customize the task scheduler just to let us control the way offers are shuffled, so we
837839
// can be sure we try both permutations, and to control the clock on the tasksetmanager.
838840
val taskScheduler = new TaskSchedulerImpl(sc) {
839-
override def shuffleOffers(offers: IndexedSeq[WorkerOffer]): IndexedSeq[WorkerOffer] = {
841+
override def doShuffleOffers(offers: IndexedSeq[WorkerOffer]): IndexedSeq[WorkerOffer] = {
840842
// Don't shuffle the offers around for this test. Instead, we'll just pass in all
841843
// the permutations we care about directly.
842844
offers
@@ -873,6 +875,40 @@ class TaskSchedulerImplSuite extends SparkFunSuite with LocalSparkContext with B
873875
}
874876
}
875877

878+
test("Shuffle-biased task scheduling enabled should lead to non-random offer shuffling") {
879+
setupScheduler("spark.scheduler.shuffleBiasedTaskScheduling.enabled" -> "true")
880+
881+
// Make offers in different executors, so they can be a mix of active, inactive, unknown
882+
val offers = IndexedSeq(
883+
WorkerOffer("exec1", "host1", 2), // inactive
884+
WorkerOffer("exec2", "host2", 2), // active
885+
WorkerOffer("exec3", "host3", 2) // unknown
886+
)
887+
val makeMapStatus = (offer: WorkerOffer) =>
888+
MapStatus(BlockManagerId(offer.executorId, offer.host, 1), Array(10))
889+
val mapOutputTracker = sc.env.mapOutputTracker.asInstanceOf[MapOutputTrackerMaster]
890+
mapOutputTracker.registerShuffle(0, 2)
891+
mapOutputTracker.registerShuffle(1, 1)
892+
mapOutputTracker.registerMapOutput(0, 0, makeMapStatus(offers(0)))
893+
mapOutputTracker.registerMapOutput(0, 1, makeMapStatus(offers(1)))
894+
mapOutputTracker.registerMapOutput(1, 0, makeMapStatus(offers(1)))
895+
mapOutputTracker.markShuffleInactive(0)
896+
897+
val execStatus = mapOutputTracker.getExecutorShuffleStatus
898+
assert(execStatus.equals(Map("exec1" -> Inactive, "exec2" -> Active)))
899+
900+
assert(taskScheduler.partitionAndShuffleOffers(offers).map(_._1)
901+
.equals(IndexedSeq(Active, Inactive, Unknown)))
902+
assert(taskScheduler.partitionAndShuffleOffers(offers).flatMap(_._2).map(offers.indexOf(_))
903+
.equals(IndexedSeq(1, 0, 2)))
904+
905+
taskScheduler.submitTasks(FakeTask.createTaskSet(3, stageId = 1, stageAttemptId = 0))
906+
// should go to active first, then inactive
907+
val taskDescs = taskScheduler.resourceOffers(offers).flatten
908+
assert(taskDescs.size === 3)
909+
assert(taskDescs.map(_.executorId).equals(Seq("exec2", "exec2", "exec1")))
910+
}
911+
876912
test("With delay scheduling off, tasks can be run at any locality level immediately") {
877913
val conf = new SparkConf()
878914
.set("spark.locality.wait", "0")

resource-managers/kubernetes/integration-tests/src/test/scala/org/apache/spark/deploy/k8s/integrationtest/DynamicAllocationTestsSuite.scala

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -83,6 +83,7 @@ private[spark] trait DynamicAllocationTestsSuite { k8sSuite: KubernetesSuite =>
8383
.addToArgs("--conf", "spark.dynamicAllocation.enabled=true")
8484
.addToArgs("--conf", "spark.dynamicAllocation.minExecutors=0")
8585
.addToArgs("--conf", "spark.dynamicAllocation.maxExecutors=1")
86+
.addToArgs("--conf", "spark.scheduler.shuffleBiasedTaskScheduling.enabled=true")
8687
.addToArgs("--conf",
8788
s"spark.driver.host=" +
8889
s"${driverService.getMetadata.getName}.${kubernetesTestComponents.namespace}.svc")

0 commit comments

Comments
 (0)