
Commit 87ca739

HeartSaVioR authored and zsxwing committed
[SPARK-24161][SS] Enable debug package feature on structured streaming
## What changes were proposed in this pull request?

Currently, the debug package has an implicit class "DebugQuery" which matches Dataset to provide debug features on the Dataset class. It doesn't work with structured streaming: debugging requires the query to be started already, and the relevant information can be retrieved from StreamingQuery, not Dataset. I guess that's why "explain" had to be placed on StreamingQuery even though it already exists on Dataset.

This patch adds a new implicit class "DebugStreamQuery" which matches StreamingQuery to provide similar debug features on the StreamingQuery class.

## How was this patch tested?

Added relevant unit tests.

Author: Jungtaek Lim <[email protected]>

Closes apache#21222 from HeartSaVioR/SPARK-24161.
1 parent 3c96937 · commit 87ca739
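
The added implicit mirrors the existing batch-side DebugQuery usage. A minimal sketch of the new streaming entry point, assuming a streaming DataFrame `df`; the sink format and query name below are illustrative, not part of the patch:

    import org.apache.spark.sql.execution.debug._

    // Start any structured streaming query (the sink choice is arbitrary here).
    val query = df.writeStream
      .queryName("debug_demo")  // hypothetical name
      .format("memory")
      .outputMode("update")
      .start()

    // Prints each WholeStageCodegen subtree together with its generated code;
    // before any batch has executed, it prints a waiting message instead.
    query.debugCodegen()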

File tree

2 files changed (+173, -2 lines)


sql/core/src/main/scala/org/apache/spark/sql/execution/debug/package.scala

Lines changed: 57 additions & 2 deletions
@@ -29,6 +29,9 @@ import org.apache.spark.sql.catalyst.expressions.Attribute
 import org.apache.spark.sql.catalyst.expressions.codegen.{CodeFormatter, CodegenContext, ExprCode}
 import org.apache.spark.sql.catalyst.plans.physical.Partitioning
 import org.apache.spark.sql.catalyst.trees.TreeNodeRef
+import org.apache.spark.sql.execution.streaming.{StreamExecution, StreamingQueryWrapper}
+import org.apache.spark.sql.execution.streaming.continuous.WriteToContinuousDataSourceExec
+import org.apache.spark.sql.streaming.StreamingQuery
 import org.apache.spark.util.{AccumulatorV2, LongAccumulator}
 
 /**
@@ -40,6 +43,16 @@ import org.apache.spark.util.{AccumulatorV2, LongAccumulator}
  *   sql("SELECT 1").debug()
  *   sql("SELECT 1").debugCodegen()
  * }}}
+ *
+ * or for streaming case (structured streaming):
+ * {{{
+ *   import org.apache.spark.sql.execution.debug._
+ *   val query = df.writeStream.<...>.start()
+ *   query.debugCodegen()
+ * }}}
+ *
+ * Note that debug in structured streaming is not supported, because it doesn't make sense for
+ * streaming to execute batch once while main query is running concurrently.
  */
 package object debug {
 
@@ -88,14 +101,50 @@ package object debug {
     }
   }
 
+  /**
+   * Get WholeStageCodegenExec subtrees and the codegen in a query plan into one String
+   *
+   * @param query the streaming query for codegen
+   * @return single String containing all WholeStageCodegen subtrees and corresponding codegen
+   */
+  def codegenString(query: StreamingQuery): String = {
+    val w = asStreamExecution(query)
+    if (w.lastExecution != null) {
+      codegenString(w.lastExecution.executedPlan)
+    } else {
+      "No physical plan. Waiting for data."
+    }
+  }
+
+  /**
+   * Get WholeStageCodegenExec subtrees and the codegen in a query plan
+   *
+   * @param query the streaming query for codegen
+   * @return Sequence of WholeStageCodegen subtrees and corresponding codegen
+   */
+  def codegenStringSeq(query: StreamingQuery): Seq[(String, String)] = {
+    val w = asStreamExecution(query)
+    if (w.lastExecution != null) {
+      codegenStringSeq(w.lastExecution.executedPlan)
+    } else {
+      Seq.empty
+    }
+  }
+
+  private def asStreamExecution(query: StreamingQuery): StreamExecution = query match {
+    case wrapper: StreamingQueryWrapper => wrapper.streamingQuery
+    case q: StreamExecution => q
+    case _ => throw new IllegalArgumentException("Parameter should be an instance of " +
+      "StreamExecution!")
+  }
+
   /**
    * Augments [[Dataset]]s with debug methods.
    */
   implicit class DebugQuery(query: Dataset[_]) extends Logging {
     def debug(): Unit = {
-      val plan = query.queryExecution.executedPlan
       val visited = new collection.mutable.HashSet[TreeNodeRef]()
-      val debugPlan = plan transform {
+      val debugPlan = query.queryExecution.executedPlan transform {
         case s: SparkPlan if !visited.contains(new TreeNodeRef(s)) =>
           visited += new TreeNodeRef(s)
           DebugExec(s)
@@ -116,6 +165,12 @@ package object debug {
     }
   }
 
+  implicit class DebugStreamQuery(query: StreamingQuery) extends Logging {
+    def debugCodegen(): Unit = {
+      debugPrint(codegenString(query))
+    }
+  }
+
   case class DebugExec(child: SparkPlan) extends UnaryExecNode with CodegenSupport {
     def output: Seq[Attribute] = child.output
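
As the branches above show, both helpers degrade gracefully before the first batch: lastExecution is null until something has run. A hedged sketch of calling the helpers directly, assuming a started query `q` as in the earlier example:

    import org.apache.spark.sql.execution.debug._

    // Before any batch has executed:
    assert(codegenString(q) == "No physical plan. Waiting for data.")
    assert(codegenStringSeq(q).isEmpty)

    // After data has been processed, each entry pairs a WholeStageCodegen
    // subtree description with its generated code.
    codegenStringSeq(q).foreach { case (subtree, code) =>
      println(subtree)
      println(code)
    }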

sql/core/src/test/scala/org/apache/spark/sql/streaming/StreamSuite.scala

Lines changed: 116 additions & 0 deletions
@@ -27,6 +27,7 @@ import scala.util.control.ControlThrowable
 import com.google.common.util.concurrent.UncheckedExecutionException
 import org.apache.commons.io.FileUtils
 import org.apache.hadoop.conf.Configuration
+import org.scalatest.time.SpanSugar._
 
 import org.apache.spark.{SparkConf, SparkContext}
 import org.apache.spark.scheduler.{SparkListener, SparkListenerJobStart}
@@ -35,6 +36,7 @@ import org.apache.spark.sql.catalyst.plans.logical.Range
 import org.apache.spark.sql.catalyst.streaming.InternalOutputModes
 import org.apache.spark.sql.execution.command.ExplainCommand
 import org.apache.spark.sql.execution.streaming._
+import org.apache.spark.sql.execution.streaming.sources.ContinuousMemoryStream
 import org.apache.spark.sql.execution.streaming.state.{StateStore, StateStoreConf, StateStoreId, StateStoreProvider}
 import org.apache.spark.sql.functions._
 import org.apache.spark.sql.internal.SQLConf
@@ -513,6 +515,120 @@ class StreamSuite extends StreamTest {
     }
   }
 
+  test("explain-continuous") {
+    val inputData = ContinuousMemoryStream[Int]
+    val df = inputData.toDS().map(_ * 2).filter(_ > 5)
+
+    // Test `df.explain`
+    val explain = ExplainCommand(df.queryExecution.logical, extended = false)
+    val explainString =
+      spark.sessionState
+        .executePlan(explain)
+        .executedPlan
+        .executeCollect()
+        .map(_.getString(0))
+        .mkString("\n")
+    assert(explainString.contains("Filter"))
+    assert(explainString.contains("MapElements"))
+    assert(!explainString.contains("LocalTableScan"))
+
+    // Test StreamingQuery.display
+    val q = df.writeStream.queryName("memory_continuous_explain")
+      .outputMode(OutputMode.Update()).format("memory")
+      .trigger(Trigger.Continuous("1 seconds"))
+      .start()
+      .asInstanceOf[StreamingQueryWrapper]
+      .streamingQuery
+    try {
+      // in continuous mode, the query will be run even there's no data
+      // sleep a bit to ensure initialization
+      eventually(timeout(2.seconds), interval(100.milliseconds)) {
+        assert(q.lastExecution != null)
+      }
+
+      val explainWithoutExtended = q.explainInternal(false)
+
+      // `extended = false` only displays the physical plan.
+      assert("Streaming RelationV2 ContinuousMemoryStream".r
+        .findAllMatchIn(explainWithoutExtended).size === 0)
+      assert("ScanV2 ContinuousMemoryStream".r
+        .findAllMatchIn(explainWithoutExtended).size === 1)
+
+      val explainWithExtended = q.explainInternal(true)
+      // `extended = true` displays 3 logical plans (Parsed/Analyzed/Optimized) and 1 physical
+      // plan.
+      assert("Streaming RelationV2 ContinuousMemoryStream".r
+        .findAllMatchIn(explainWithExtended).size === 3)
+      assert("ScanV2 ContinuousMemoryStream".r
+        .findAllMatchIn(explainWithExtended).size === 1)
+    } finally {
+      q.stop()
+    }
+  }
+
+  test("codegen-microbatch") {
+    val inputData = MemoryStream[Int]
+    val df = inputData.toDS().map(_ * 2).filter(_ > 5)
+
+    // Test StreamingQuery.codegen
+    val q = df.writeStream.queryName("memory_microbatch_codegen")
+      .outputMode(OutputMode.Update)
+      .format("memory")
+      .trigger(Trigger.ProcessingTime("1 seconds"))
+      .start()
+
+    try {
+      import org.apache.spark.sql.execution.debug._
+      assert("No physical plan. Waiting for data." === codegenString(q))
+      assert(codegenStringSeq(q).isEmpty)
+
+      inputData.addData(1, 2, 3, 4, 5)
+      q.processAllAvailable()
+
+      assertDebugCodegenResult(q)
+    } finally {
+      q.stop()
+    }
+  }
+
+  test("codegen-continuous") {
+    val inputData = ContinuousMemoryStream[Int]
+    val df = inputData.toDS().map(_ * 2).filter(_ > 5)
+
+    // Test StreamingQuery.codegen
+    val q = df.writeStream.queryName("memory_continuous_codegen")
+      .outputMode(OutputMode.Update)
+      .format("memory")
+      .trigger(Trigger.Continuous("1 seconds"))
+      .start()
+
+    try {
+      // in continuous mode, the query will be run even there's no data
+      // sleep a bit to ensure initialization
+      eventually(timeout(2.seconds), interval(100.milliseconds)) {
+        assert(q.asInstanceOf[StreamingQueryWrapper].streamingQuery.lastExecution != null)
+      }
+
+      assertDebugCodegenResult(q)
+    } finally {
+      q.stop()
+    }
+  }
+
+  private def assertDebugCodegenResult(query: StreamingQuery): Unit = {
+    import org.apache.spark.sql.execution.debug._
+
+    val codegenStr = codegenString(query)
+    assert(codegenStr.contains("Found 1 WholeStageCodegen subtrees."))
+    // assuming that code is generated for the test query
+    assert(codegenStr.contains("Generated code:"))
+
+    val codegenStrSeq = codegenStringSeq(query)
+    assert(codegenStrSeq.nonEmpty)
+    assert(codegenStrSeq.head._1.contains("*(1)"))
+    assert(codegenStrSeq.head._2.contains("codegenStageId=1"))
+  }
+
   test("SPARK-19065: dropDuplicates should not create expressions using the same id") {
     withTempPath { testPath =>
       val data = Seq((1, 2), (2, 3), (3, 4))
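
Note that the continuous-mode tests reach lastExecution by casting through StreamingQueryWrapper, whereas codegenString/codegenStringSeq accept the wrapper as-is because asStreamExecution unwraps it internally. A sketch of the two equivalent routes, with `q` standing for the value returned by start():

    import org.apache.spark.sql.execution.debug._
    import org.apache.spark.sql.execution.streaming.StreamingQueryWrapper

    // Manual unwrapping, as done in the tests above:
    val exec = q.asInstanceOf[StreamingQueryWrapper].streamingQuery
    val last = exec.lastExecution  // null until a batch/epoch has run
    if (last != null) println(last.executedPlan)

    // Equivalent via the new helpers, which unwrap through asStreamExecution:
    val subtrees = codegenStringSeq(q)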
