Commit e55953b

gaborgsomogyi authored and Marcelo Vanzin committed
[SPARK-24022][TEST] Make SparkContextSuite not flaky
## What changes were proposed in this pull request?

SparkContextSuite.test("Cancelling stages/jobs with custom reasons.") could stay in an infinite loop because of the problem found and fixed in [SPARK-23775](https://issues.apache.org/jira/browse/SPARK-23775). This PR fixes that flakiness by removing the shared-variable usage when a cancel happens in the loop, and by using wait() and a CountDownLatch for synchronization.

## How was this patch tested?

Existing unit test.

Author: Gabor Somogyi <[email protected]>

Closes apache#21105 from gaborgsomogyi/SPARK-24022.
1 parent 9ea8d3d · commit e55953b
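As background, here is a minimal, self-contained sketch of the synchronization pattern the patch adopts (illustrative only, not Spark code; `LatchPatternSketch` and the thread are hypothetical stand-ins): the event handler issues the cancel and counts down a latch, and the test thread awaits that latch instead of polling shared @volatile flags.

```scala
import java.util.concurrent.{CountDownLatch, TimeUnit}

// Hypothetical stand-in for the pattern the patch introduces: a callback
// signals through a CountDownLatch instead of mutating shared state that
// another thread polls in a loop.
object LatchPatternSketch {
  def main(args: Array[String]): Unit = {
    val latch = new CountDownLatch(1)

    // Plays the role of SparkListener.onTaskStart / onJobStart.
    val listenerThread = new Thread(() => {
      println("listener: cancelling with custom reason")
      latch.countDown() // signal that the cancel has been issued
    })
    listenerThread.start()

    // Plays the role of the test body: block, bounded by a timeout, until
    // the cancel has actually happened.
    assert(latch.await(20, TimeUnit.SECONDS), "listener never fired")
    println("test: cancel observed, safe to assert on final state")
  }
}
```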

File tree: 1 file changed (+26, −35 lines)

core/src/test/scala/org/apache/spark/SparkContextSuite.scala

@@ -20,7 +20,7 @@ package org.apache.spark
 import java.io.File
 import java.net.{MalformedURLException, URI}
 import java.nio.charset.StandardCharsets
-import java.util.concurrent.{Semaphore, TimeUnit}
+import java.util.concurrent.{CountDownLatch, Semaphore, TimeUnit}
 
 import scala.concurrent.duration._
 
@@ -498,45 +498,36 @@ class SparkContextSuite extends SparkFunSuite with LocalSparkContext with Eventually {
 
   test("Cancelling stages/jobs with custom reasons.") {
     sc = new SparkContext(new SparkConf().setAppName("test").setMaster("local"))
+    sc.setLocalProperty(SparkContext.SPARK_JOB_INTERRUPT_ON_CANCEL, "true")
     val REASON = "You shall not pass"
-    val slices = 10
 
-    val listener = new SparkListener {
-      override def onTaskStart(taskStart: SparkListenerTaskStart): Unit = {
-        if (SparkContextSuite.cancelStage) {
-          eventually(timeout(10.seconds)) {
-            assert(SparkContextSuite.isTaskStarted)
+    for (cancelWhat <- Seq("stage", "job")) {
+      // This countdown latch used to make sure stage or job canceled in listener
+      val latch = new CountDownLatch(1)
+
+      val listener = cancelWhat match {
+        case "stage" =>
+          new SparkListener {
+            override def onTaskStart(taskStart: SparkListenerTaskStart): Unit = {
+              sc.cancelStage(taskStart.stageId, REASON)
+              latch.countDown()
+            }
           }
-          sc.cancelStage(taskStart.stageId, REASON)
-          SparkContextSuite.cancelStage = false
-          SparkContextSuite.semaphore.release(slices)
-        }
-      }
-
-      override def onJobStart(jobStart: SparkListenerJobStart): Unit = {
-        if (SparkContextSuite.cancelJob) {
-          eventually(timeout(10.seconds)) {
-            assert(SparkContextSuite.isTaskStarted)
+        case "job" =>
+          new SparkListener {
+            override def onJobStart(jobStart: SparkListenerJobStart): Unit = {
+              sc.cancelJob(jobStart.jobId, REASON)
+              latch.countDown()
+            }
           }
-          sc.cancelJob(jobStart.jobId, REASON)
-          SparkContextSuite.cancelJob = false
-          SparkContextSuite.semaphore.release(slices)
-        }
       }
-    }
-    sc.addSparkListener(listener)
-
-    for (cancelWhat <- Seq("stage", "job")) {
-      SparkContextSuite.semaphore.drainPermits()
-      SparkContextSuite.isTaskStarted = false
-      SparkContextSuite.cancelStage = (cancelWhat == "stage")
-      SparkContextSuite.cancelJob = (cancelWhat == "job")
+      sc.addSparkListener(listener)
 
       val ex = intercept[SparkException] {
-        sc.range(0, 10000L, numSlices = slices).mapPartitions { x =>
-          SparkContextSuite.isTaskStarted = true
-          // Block waiting for the listener to cancel the stage or job.
-          SparkContextSuite.semaphore.acquire()
+        sc.range(0, 10000L, numSlices = 10).mapPartitions { x =>
+          x.synchronized {
+            x.wait()
+          }
           x
         }.count()
      }
@@ -550,9 +541,11 @@ class SparkContextSuite extends SparkFunSuite with LocalSparkContext with Eventually {
          fail("Expected the cause to be SparkException, got " + cause.toString() + " instead.")
       }
 
+      latch.await(20, TimeUnit.SECONDS)
       eventually(timeout(20.seconds)) {
         assert(sc.statusTracker.getExecutorInfos.map(_.numRunningTasks()).sum == 0)
       }
+      sc.removeSparkListener(listener)
     }
   }
 
@@ -637,8 +630,6 @@ class SparkContextSuite extends SparkFunSuite with LocalSparkContext with Eventually {
 }
 
 object SparkContextSuite {
-  @volatile var cancelJob = false
-  @volatile var cancelStage = false
   @volatile var isTaskStarted = false
   @volatile var taskKilled = false
   @volatile var taskSucceeded = false
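A note on why the new task body cannot hang: the patch enables the SPARK_JOB_INTERRUPT_ON_CANCEL local property, so cancelling the stage or job interrupts the blocked task thread, and `Object.wait()` responds to interruption. A minimal sketch of that JVM mechanism, outside Spark (names are illustrative):

```scala
// Illustrative only: shows why `x.synchronized { x.wait() }` terminates once
// the job is cancelled with interrupt-on-cancel enabled. An interrupted
// Object.wait() throws InterruptedException rather than blocking forever.
object WaitInterruptSketch {
  def main(args: Array[String]): Unit = {
    val lock = new Object
    val taskLike = new Thread(() => {
      try {
        lock.synchronized {
          lock.wait() // parks indefinitely, like the task body in the test
        }
      } catch {
        case _: InterruptedException =>
          println("task thread woken by interrupt, as on job cancel")
      }
    })
    taskLike.start()
    Thread.sleep(100)    // give the thread time to reach wait()
    taskLike.interrupt() // stand-in for the task kill issued on cancel
    taskLike.join()
  }
}
```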
