
Commit b3c7f3f

carsonwang authored and Justin Uang committed
Add QueryStage and the framework for adaptive execution
1 parent b50649d commit b3c7f3f

File tree

16 files changed: +594 -410 lines changed

core/src/main/scala/org/apache/spark/MapOutputStatistics.scala

Lines changed: 1 addition & 0 deletions
@@ -25,3 +25,4 @@ package org.apache.spark
  * (may be inexact due to use of compressed map statuses)
  */
 private[spark] class MapOutputStatistics(val shuffleId: Int, val bytesByPartitionId: Array[Long])
+  extends Serializable

sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala

Lines changed: 15 additions & 9 deletions
@@ -280,14 +280,19 @@ object SQLConf {
 
   val SHUFFLE_MIN_NUM_POSTSHUFFLE_PARTITIONS =
     buildConf("spark.sql.adaptive.minNumPostShufflePartitions")
-      .internal()
-      .doc("The advisory minimal number of post-shuffle partitions provided to " +
-        "ExchangeCoordinator. This setting is used in our test to make sure we " +
-        "have enough parallelism to expose issues that will not be exposed with a " +
-        "single partition. When the value is a non-positive value, this setting will " +
-        "not be provided to ExchangeCoordinator.")
+      .doc("The advisory minimum number of post-shuffle partitions used in adaptive execution.")
+      .intConf
+      .checkValue(numPartitions => numPartitions > 0, "The minimum shuffle partition number " +
+        "must be a positive integer.")
+      .createWithDefault(1)
+
+  val SHUFFLE_MAX_NUM_POSTSHUFFLE_PARTITIONS =
+    buildConf("spark.sql.adaptive.maxNumPostShufflePartitions")
+      .doc("The advisory maximum number of post-shuffle partitions used in adaptive execution.")
       .intConf
-      .createWithDefault(-1)
+      .checkValue(numPartitions => numPartitions > 0, "The maximum shuffle partition number " +
+        "must be a positive integer.")
+      .createWithDefault(500)
 
   val SUBEXPRESSION_ELIMINATION_ENABLED =
     buildConf("spark.sql.subexpressionElimination.enabled")
@@ -1698,8 +1703,9 @@ class SQLConf extends Serializable with Logging {
 
   def adaptiveExecutionEnabled: Boolean = getConf(ADAPTIVE_EXECUTION_ENABLED)
 
-  def minNumPostShufflePartitions: Int =
-    getConf(SHUFFLE_MIN_NUM_POSTSHUFFLE_PARTITIONS)
+  def minNumPostShufflePartitions: Int = getConf(SHUFFLE_MIN_NUM_POSTSHUFFLE_PARTITIONS)
+
+  def maxNumPostShufflePartitions: Int = getConf(SHUFFLE_MAX_NUM_POSTSHUFFLE_PARTITIONS)
 
   def minBatchesToRetain: Int = getConf(MIN_BATCHES_TO_RETAIN)
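
As a usage note (not part of this diff): the two new keys bound the partition count that adaptive execution may pick after a shuffle. A minimal sketch of setting them on a session, assuming a local SparkSession and purely illustrative values:

// Hypothetical usage sketch: bound the post-shuffle partition search space.
// The config keys come from this commit; the chosen values are illustrative.
import org.apache.spark.sql.SparkSession

val spark = SparkSession.builder()
  .appName("adaptive-demo")
  .master("local[*]")
  .config("spark.sql.adaptive.enabled", "true")
  .config("spark.sql.adaptive.minNumPostShufflePartitions", "4")
  .config("spark.sql.adaptive.maxNumPostShufflePartitions", "200")
  .getOrCreate()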

sql/core/src/main/scala/org/apache/spark/sql/execution/QueryExecution.scala

Lines changed: 15 additions & 1 deletion
@@ -27,6 +27,7 @@ import org.apache.spark.sql.catalyst.analysis.UnsupportedOperationChecker
 import org.apache.spark.sql.catalyst.plans.logical.{LogicalPlan, ReturnAnswer}
 import org.apache.spark.sql.catalyst.rules.Rule
 import org.apache.spark.sql.catalyst.util.DateTimeUtils
+import org.apache.spark.sql.execution.adaptive.PlanQueryStage
 import org.apache.spark.sql.execution.command.{DescribeTableCommand, ExecutedCommandExec, ShowTablesCommand}
 import org.apache.spark.sql.execution.exchange.{EnsureRequirements, ReuseExchange}
 import org.apache.spark.sql.types.{BinaryType, DateType, DecimalType, TimestampType, _}
@@ -84,7 +85,11 @@ class QueryExecution(val sparkSession: SparkSession, val logical: LogicalPlan) {
    * row format conversions as needed.
    */
   protected def prepareForExecution(plan: SparkPlan): SparkPlan = {
-    preparations.foldLeft(plan) { case (sp, rule) => rule.apply(sp) }
+    if (sparkSession.sessionState.conf.adaptiveExecutionEnabled) {
+      adaptivePreparations.foldLeft(plan) { case (sp, rule) => rule.apply(sp) }
+    } else {
+      preparations.foldLeft(plan) { case (sp, rule) => rule.apply(sp) }
+    }
   }
 
   /** A sequence of rules that will be applied in order to the physical plan before execution. */
@@ -95,6 +100,15 @@ class QueryExecution(val sparkSession: SparkSession, val logical: LogicalPlan) {
     ReuseExchange(sparkSession.sessionState.conf),
     ReuseSubquery(sparkSession.sessionState.conf))
 
+  protected def adaptivePreparations: Seq[Rule[SparkPlan]] = Seq(
+    PlanSubqueries(sparkSession),
+    EnsureRequirements(sparkSession.sessionState.conf),
+    ReuseSubquery(sparkSession.sessionState.conf),
+    // PlanQueryStage needs to be the last rule because it divides the plan into multiple
+    // sub-trees by inserting leaf node QueryStageInput. Transforming the plan after applying
+    // this rule will only transform nodes in a sub-tree.
+    PlanQueryStage(sparkSession.sessionState.conf))
+
   protected def stringOrError[A](f: => A): String =
     try f.toString catch { case e: AnalysisException => e.toString }
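
The fold in prepareForExecution threads the plan through each rule in order, so the output of one rule is the input of the next; that ordering is why PlanQueryStage must come last. A self-contained sketch of the pattern with stand-in types (Rule and Plan here are toys, not Spark's classes):

// Each rule rewrites the whole plan; rules compose left to right via foldLeft.
trait Rule[T] { def apply(plan: T): T }
case class Plan(desc: String) // stand-in for SparkPlan

object AddExchange extends Rule[Plan] {
  def apply(p: Plan): Plan = Plan(s"Exchange(${p.desc})")
}
object WrapStage extends Rule[Plan] {
  def apply(p: Plan): Plan = Plan(s"QueryStage(${p.desc})")
}

val rules: Seq[Rule[Plan]] = Seq(AddExchange, WrapStage)
val prepared = rules.foldLeft(Plan("Scan")) { case (sp, rule) => rule.apply(sp) }
// prepared == Plan("QueryStage(Exchange(Scan))")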

sql/core/src/main/scala/org/apache/spark/sql/execution/SparkPlanInfo.scala

Lines changed: 2 additions & 0 deletions
@@ -18,6 +18,7 @@
 package org.apache.spark.sql.execution
 
 import org.apache.spark.annotation.DeveloperApi
+import org.apache.spark.sql.execution.adaptive.QueryStageInput
 import org.apache.spark.sql.execution.exchange.ReusedExchangeExec
 import org.apache.spark.sql.execution.metric.SQLMetricInfo
 
@@ -51,6 +52,7 @@ private[execution] object SparkPlanInfo {
   def fromSparkPlan(plan: SparkPlan): SparkPlanInfo = {
     val children = plan match {
       case ReusedExchangeExec(_, child) => child :: Nil
+      case i: QueryStageInput => i.childStage :: Nil
       case _ => plan.children ++ plan.subqueries
     }
     val metrics = plan.metrics.toSeq.map { case (key, metric) =>
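
The extra match arm exists because QueryStageInput is a leaf in the main plan tree; without it the UI tree would stop at the input node and never show the stage's sub-plan. A toy sketch of that unwrapping idea (stand-in types, not Spark's):

// Wrappers report no children, so the traversal must surface the hidden plan.
sealed trait Node { def children: Seq[Node] }
case class Leaf(name: String) extends Node { def children: Seq[Node] = Nil }
case class StageInput(hidden: Node) extends Node { def children: Seq[Node] = Nil }

def visibleChildren(n: Node): Seq[Node] = n match {
  case s: StageInput => s.hidden :: Nil // surface the wrapped stage for display
  case other => other.children
}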
sql/core/src/main/scala/org/apache/spark/sql/execution/adaptive/PlanQueryStage.scala

Lines changed: 79 additions & 0 deletions
@@ -0,0 +1,79 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.sql.execution.adaptive
+
+import scala.collection.mutable
+import scala.collection.mutable.ArrayBuffer
+
+import org.apache.spark.sql.catalyst.rules.Rule
+import org.apache.spark.sql.execution.SparkPlan
+import org.apache.spark.sql.execution.command.ExecutedCommandExec
+import org.apache.spark.sql.execution.exchange.{BroadcastExchangeExec, Exchange, ShuffleExchangeExec}
+import org.apache.spark.sql.internal.SQLConf
+import org.apache.spark.sql.types.StructType
+
+/**
+ * Divides the Spark plan into multiple QueryStages. For each Exchange in the plan, it adds a
+ * QueryStage and a QueryStageInput. If exchange reuse is enabled, it finds duplicated exchanges
+ * and uses the same QueryStage for all the references.
+ */
+case class PlanQueryStage(conf: SQLConf) extends Rule[SparkPlan] {
+
+  def apply(plan: SparkPlan): SparkPlan = {
+
+    val newPlan = if (!conf.exchangeReuseEnabled) {
+      plan.transformUp {
+        case e: ShuffleExchangeExec =>
+          ShuffleQueryStageInput(ShuffleQueryStage(e), e.output)
+        case e: BroadcastExchangeExec =>
+          BroadcastQueryStageInput(BroadcastQueryStage(e), e.output)
+      }
+    } else {
+      // Build a hash map using the schema of exchanges to avoid O(N*N) sameResult calls.
+      val stages = mutable.HashMap[StructType, ArrayBuffer[QueryStage]]()
+
+      plan.transformUp {
+        case exchange: Exchange =>
+          val sameSchema = stages.getOrElseUpdate(exchange.schema, ArrayBuffer[QueryStage]())
+          val samePlan = sameSchema.find { s =>
+            exchange.sameResult(s.child)
+          }
+          if (samePlan.isDefined) {
+            // Keep the output of this exchange; the following plans require it to resolve
+            // attributes.
+            exchange match {
+              case e: ShuffleExchangeExec => ShuffleQueryStageInput(
+                samePlan.get.asInstanceOf[ShuffleQueryStage], exchange.output)
+              case e: BroadcastExchangeExec => BroadcastQueryStageInput(
+                samePlan.get.asInstanceOf[BroadcastQueryStage], exchange.output)
+            }
+          } else {
+            val queryStageInput = exchange match {
+              case e: ShuffleExchangeExec =>
+                ShuffleQueryStageInput(ShuffleQueryStage(e), e.output)
+              case e: BroadcastExchangeExec =>
+                BroadcastQueryStageInput(BroadcastQueryStage(e), e.output)
+            }
+            sameSchema += queryStageInput.childStage
+            queryStageInput
+          }
+      }
+    }
+    ResultQueryStage(newPlan)
+  }
+}
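
The reuse branch avoids an O(N*N) pass of sameResult comparisons by bucketing exchanges on their schema first and only running the expensive check within a bucket. A self-contained sketch of that lookup, with String standing in for StructType and plain equality standing in for sameResult:

import scala.collection.mutable
import scala.collection.mutable.ArrayBuffer

// Group candidates by a cheap hashable key, then do the costly comparison
// only inside the matching bucket; reuse the stored stage when one matches.
case class Stage(schema: String, plan: String)
val stages = mutable.HashMap[String, ArrayBuffer[Stage]]()

def stageFor(schema: String, plan: String): Stage = {
  val bucket = stages.getOrElseUpdate(schema, ArrayBuffer[Stage]())
  bucket.find(_.plan == plan).getOrElse {
    val fresh = Stage(schema, plan)
    bucket += fresh // remember for later reuse
    fresh
  }
}

// Two identical "exchanges" resolve to the same stage instance.
assert(stageFor("int,string", "scan t1") eq stageFor("int,string", "scan t1"))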
sql/core/src/main/scala/org/apache/spark/sql/execution/adaptive/QueryStage.scala

Lines changed: 222 additions & 0 deletions
@@ -0,0 +1,222 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.sql.execution.adaptive
+
+import scala.concurrent.{ExecutionContext, Future}
+import scala.concurrent.duration.Duration
+
+import org.apache.spark.MapOutputStatistics
+import org.apache.spark.broadcast
+import org.apache.spark.rdd.RDD
+import org.apache.spark.sql.catalyst.InternalRow
+import org.apache.spark.sql.catalyst.expressions._
+import org.apache.spark.sql.catalyst.plans.physical.Partitioning
+import org.apache.spark.sql.execution._
+import org.apache.spark.sql.execution.exchange._
+import org.apache.spark.sql.execution.ui.SparkListenerSQLAdaptiveExecutionUpdate
+import org.apache.spark.util.ThreadUtils
+
+/**
+ * In adaptive execution mode, an execution plan is divided into multiple QueryStages. Each
+ * QueryStage is a sub-tree that runs in a single stage.
+ */
+abstract class QueryStage extends UnaryExecNode {
+
+  var child: SparkPlan
+
+  // Ignore this wrapper for canonicalizing.
+  override def doCanonicalize(): SparkPlan = child.canonicalized
+
+  override def output: Seq[Attribute] = child.output
+
+  override def outputPartitioning: Partitioning = child.outputPartitioning
+
+  override def outputOrdering: Seq[SortOrder] = child.outputOrdering
+
+  /**
+   * Execute child stages and wait until all stages are completed. Use a thread pool to avoid
+   * blocking on one child stage.
+   */
+  def executeChildStages(): Unit = {
+    // Handle broadcast stages
+    val broadcastQueryStages: Seq[BroadcastQueryStage] = child.collect {
+      case bqs: BroadcastQueryStageInput => bqs.childStage
+    }
+    val broadcastFutures = broadcastQueryStages.map { queryStage =>
+      Future { queryStage.prepareBroadcast() }(QueryStage.executionContext)
+    }
+
+    // Submit shuffle stages
+    val executionId = sqlContext.sparkContext.getLocalProperty(SQLExecution.EXECUTION_ID_KEY)
+    val shuffleQueryStages: Seq[ShuffleQueryStage] = child.collect {
+      case sqs: ShuffleQueryStageInput => sqs.childStage
+    }
+    val shuffleStageFutures = shuffleQueryStages.map { queryStage =>
+      Future {
+        SQLExecution.withExecutionId(sqlContext.sparkSession, executionId) {
+          queryStage.execute()
+        }
+      }(QueryStage.executionContext)
+    }
+
+    ThreadUtils.awaitResult(
+      Future.sequence(broadcastFutures)(implicitly, QueryStage.executionContext), Duration.Inf)
+    ThreadUtils.awaitResult(
+      Future.sequence(shuffleStageFutures)(implicitly, QueryStage.executionContext), Duration.Inf)
+  }
+
+  /**
+   * Before executing the plan in this query stage, we execute all child stages, optimize the plan
+   * in this stage and determine the reducer number based on the child stages' statistics. Finally
+   * we do codegen for this query stage and update the UI with the new plan.
+   */
+  def prepareExecuteStage(): Unit = {
+    // 1. Execute child stages
+    executeChildStages()
+    // It is possible to optimize this stage's plan here based on the child stages' statistics.
+
+    // 2. Determine reducer number
+    val queryStageInputs: Seq[ShuffleQueryStageInput] = child.collect {
+      case input: ShuffleQueryStageInput => input
+    }
+    val childMapOutputStatistics = queryStageInputs.map(_.childStage.mapOutputStatistics)
+      .filter(_ != null).toArray
+    if (childMapOutputStatistics.length > 0) {
+      val exchangeCoordinator = new ExchangeCoordinator(
+        conf.targetPostShuffleInputSize,
+        conf.minNumPostShufflePartitions)
+
+      val partitionStartIndices =
+        exchangeCoordinator.estimatePartitionStartIndices(childMapOutputStatistics)
+      child = child.transform {
+        case ShuffleQueryStageInput(childStage, output, _) =>
+          ShuffleQueryStageInput(childStage, output, Some(partitionStartIndices))
+      }
+    }
+
+    // 3. Codegen and update the UI
+    child = CollapseCodegenStages(sqlContext.conf).apply(child)
+    val executionId = sqlContext.sparkContext.getLocalProperty(SQLExecution.EXECUTION_ID_KEY)
+    if (executionId != null && executionId.nonEmpty) {
+      val queryExecution = SQLExecution.getQueryExecution(executionId.toLong)
+      sparkContext.listenerBus.post(SparkListenerSQLAdaptiveExecutionUpdate(
+        executionId.toLong,
+        queryExecution.toString,
+        SparkPlanInfo.fromSparkPlan(queryExecution.executedPlan)))
+    }
+  }
+
+  // Caches the created ShuffleRowRDD so we can reuse it.
+  private var cachedRDD: RDD[InternalRow] = null
+
+  def executeStage(): RDD[InternalRow] = child.execute()
+
+  /**
+   * A QueryStage can be reused like Exchange. It is possible that multiple threads try to submit
+   * the same QueryStage. Use synchronized to make sure it is executed only once.
+   */
+  override def doExecute(): RDD[InternalRow] = synchronized {
+    if (cachedRDD == null) {
+      prepareExecuteStage()
+      cachedRDD = executeStage()
+    }
+    cachedRDD
+  }
+
+  override def executeCollect(): Array[InternalRow] = {
+    prepareExecuteStage()
+    child.executeCollect()
+  }
+
+  override def executeToIterator(): Iterator[InternalRow] = {
+    prepareExecuteStage()
+    child.executeToIterator()
+  }
+
+  override def executeTake(n: Int): Array[InternalRow] = {
+    prepareExecuteStage()
+    child.executeTake(n)
+  }
+
+  override def generateTreeString(
+      depth: Int,
+      lastChildren: Seq[Boolean],
+      builder: StringBuilder,
+      verbose: Boolean,
+      prefix: String = "",
+      addSuffix: Boolean = false): StringBuilder = {
+    child.generateTreeString(depth, lastChildren, builder, verbose, "*")
+  }
+}
+
+/**
+ * The last QueryStage of an execution plan.
+ */
+case class ResultQueryStage(var child: SparkPlan) extends QueryStage
+
+/**
+ * A shuffle QueryStage whose child is a ShuffleExchangeExec.
+ */
+case class ShuffleQueryStage(var child: SparkPlan) extends QueryStage {
+
+  protected var _mapOutputStatistics: MapOutputStatistics = null
+
+  def mapOutputStatistics: MapOutputStatistics = _mapOutputStatistics
+
+  override def executeStage(): RDD[InternalRow] = {
+    child match {
+      case e: ShuffleExchangeExec =>
+        val result = e.eagerExecute()
+        _mapOutputStatistics = e.mapOutputStatistics
+        result
+      case _ => throw new IllegalArgumentException(
+        "The child of ShuffleQueryStage must be a ShuffleExchangeExec.")
+    }
+  }
+}
+
+/**
+ * A broadcast QueryStage whose child is a BroadcastExchangeExec.
+ */
+case class BroadcastQueryStage(var child: SparkPlan) extends QueryStage {
+  override def doExecuteBroadcast[T](): broadcast.Broadcast[T] = {
+    child.executeBroadcast()
+  }
+
+  private var prepared = false
+
+  def prepareBroadcast(): Unit = synchronized {
+    if (!prepared) {
+      executeChildStages()
+      child = CollapseCodegenStages(sqlContext.conf).apply(child)
+      // After child stages are completed, prepare() triggers the broadcast.
+      prepare()
+      prepared = true
+    }
+  }
+
+  override def doExecute(): RDD[InternalRow] = {
+    throw new UnsupportedOperationException(
+      "BroadcastExchange does not support the execute() code path.")
+  }
+}
+
+object QueryStage {
+  private[execution] val executionContext = ExecutionContext.fromExecutorService(
+    ThreadUtils.newDaemonCachedThreadPool("adaptive-query-stage"))
+}
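
executeChildStages fans each child stage out on a shared pool and then blocks until all futures finish, so independent stages run concurrently. A self-contained sketch of that fan-out/fan-in pattern, using a plain cached thread pool in place of ThreadUtils.newDaemonCachedThreadPool("adaptive-query-stage"):

import java.util.concurrent.Executors
import scala.concurrent.{Await, ExecutionContext, Future}
import scala.concurrent.duration.Duration

// Submit every child "stage" on the pool, then await them all before the
// parent stage proceeds -- mirroring the awaitResult calls above.
implicit val stagePool: ExecutionContext =
  ExecutionContext.fromExecutorService(Executors.newCachedThreadPool())

def runStage(id: Int): Int = { Thread.sleep(50); id } // stand-in stage work

val futures = (1 to 3).map(id => Future { runStage(id) })
val results = Await.result(Future.sequence(futures), Duration.Inf)
// results == Vector(1, 2, 3): all child stages completed before this point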
